From ad2d2f9e3d6f8787220df365c3dd2f41d78fad21 Mon Sep 17 00:00:00 2001 From: David Goodwin Date: Fri, 5 Feb 2021 14:07:06 -0800 Subject: [PATCH] Initial backend template based on onnxruntime_backend --- CMakeLists.txt | 235 +++ LICENSE | 25 + README.md | 39 +- cmake/TritonOpenVINOBackendConfig.cmake.in | 39 + src/libtriton_openvino.ldscript | 30 + src/openvino.cc | 1839 ++++++++++++++++++++ tools/gen_openvino_dockerfile.py | 126 ++ 7 files changed, 2332 insertions(+), 1 deletion(-) create mode 100644 CMakeLists.txt create mode 100644 LICENSE create mode 100644 cmake/TritonOpenVINOBackendConfig.cmake.in create mode 100644 src/libtriton_openvino.ldscript create mode 100644 src/openvino.cc create mode 100755 tools/gen_openvino_dockerfile.py diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..687ec08 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,235 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cmake_minimum_required(VERSION 3.17) + +project(tritonopenvinobackend LANGUAGES C CXX) + +# +# Set TRITON_BUILD_OPENVINO_VERSION to the version of OpenVINO that +# you want to be built for the backend. Set +# TRITON_BUILD_CONTAINER_VERSION to the Triton container version that +# you want to target with the build. 
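+# For example, a hypothetical invocation (the version values below are
+# illustrative only; substitute the OpenVINO release and Triton
+# container version you are actually targeting):
+#
+#   cmake -DTRITON_BUILD_OPENVINO_VERSION=2021.2.200 \
+#         -DTRITON_BUILD_CONTAINER_VERSION=21.02 ..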
+# +option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) +set(TRITON_BUILD_CONTAINER_VERSION "" CACHE STRING "Triton container version to build for") +set(TRITON_BUILD_OPENVINO_VERSION "" CACHE STRING "OpenVINO version to build") + +set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo") +set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") +set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +if(NOT TRITON_BUILD_OPENVINO_VERSION) + message(FATAL_ERROR "TRITON_BUILD_OPENVINO_VERSION is required") +endif() +if(NOT TRITON_BUILD_CONTAINER_VERSION) + message(FATAL_ERROR "TRITON_BUILD_OPENVINO_VERSION requires TRITON_BUILD_CONTAINER_VERSION") +endif() + +set(TRITON_OPENVINO_INCLUDE_PATHS "${CMAKE_CURRENT_BINARY_DIR}/openvino/include") +set(TRITON_OPENVINO_LIB_PATHS "${CMAKE_CURRENT_BINARY_DIR}/openvino/lib") +set(TRITON_OPENVINO_DOCKER_IMAGE "tritonserver_openvino") +set(OPENVINO_LIBRARY "libinference_engine.so") + +# +# Dependencies +# +# FetchContent's composibility isn't very good. We must include the +# transitive closure of all repos so that we can override the tag. +# +include(FetchContent) + +FetchContent_Declare( + repo-common + GIT_REPOSITORY https://github.com/triton-inference-server/common.git + GIT_TAG ${TRITON_COMMON_REPO_TAG} + GIT_SHALLOW ON +) +FetchContent_Declare( + repo-core + GIT_REPOSITORY https://github.com/triton-inference-server/core.git + GIT_TAG ${TRITON_CORE_REPO_TAG} + GIT_SHALLOW ON +) +FetchContent_Declare( + repo-backend + GIT_REPOSITORY https://github.com/triton-inference-server/backend.git + GIT_TAG ${TRITON_BACKEND_REPO_TAG} + GIT_SHALLOW ON +) +FetchContent_MakeAvailable(repo-common repo-core repo-backend) + +# +# Shared library implementing the Triton Backend API +# +configure_file(src/libtriton_openvino.ldscript libtriton_openvino.ldscript COPYONLY) + +add_library( + triton-openvino-backend SHARED + src/openvino.cc +) + +add_library( + TritonOpenVINOBackend::triton-openvino-backend ALIAS triton-openvino-backend +) + +target_include_directories( + triton-openvino-backend + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src + ${TRITON_OPENVINO_INCLUDE_PATHS} +) + +target_compile_features(triton-openvino-backend PRIVATE cxx_std_11) +target_compile_options( + triton-openvino-backend PRIVATE + $<$,$,$>: + -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> + $<$:/Wall /D_WIN32_WINNT=0x0A00 /EHsc> +) + +set_target_properties( + triton-openvino-backend + PROPERTIES + POSITION_INDEPENDENT_CODE ON + OUTPUT_NAME triton_openvino + SKIP_BUILD_RPATH TRUE + BUILD_WITH_INSTALL_RPATH TRUE + INSTALL_RPATH_USE_LINK_PATH FALSE + INSTALL_RPATH "$\{ORIGIN\}" + LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_openvino.ldscript + LINK_FLAGS "-Wl,--version-script libtriton_openvino.ldscript" +) + +FOREACH(p ${TRITON_OPENVINO_LIB_PATHS}) + set(TRITON_OPENVINO_LDFLAGS ${TRITON_OPENVINO_LDFLAGS} "-L${p}") +ENDFOREACH(p) + +target_link_libraries( + triton-openvino-backend + PRIVATE + triton-core-serverapi # from repo-core + triton-core-backendapi # from repo-core + triton-core-serverstub # from repo-core + triton-backend-utils # from repo-backend + ${TRITON_OPENVINO_LDFLAGS} + ${OPENVINO_LIBRARY} +) + +# +# Build the OpenVINO libraries using docker. 
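+# The custom command below generates Dockerfile.openvino with
+# tools/gen_openvino_dockerfile.py, builds an image from it, then uses
+# "docker create" / "docker cp" to pull the compiled /opt/openvino
+# artifacts out of a temporary container into the build tree so the
+# backend can link against them.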
+# +add_custom_command( + OUTPUT + openvino/lib/${OPENVINO_LIBRARY} + COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/tools/gen_openvino_dockerfile.py --triton-container="nvcr.io/nvidia/tritonserver:${TRITON_BUILD_CONTAINER_VERSION}-py3-min" --openvino-version="${TRITON_BUILD_OPENVINO_VERSION}" --output=Dockerfile.openvino + COMMAND docker build --cache-from=${TRITON_OPENVINO_DOCKER_IMAGE} --cache-from=${TRITON_OPENVINO_DOCKER_IMAGE}_cache0 --cache-from=${TRITON_OPENVINO_DOCKER_IMAGE}_cache1 -t ${TRITON_OPENVINO_DOCKER_IMAGE} -f ./Dockerfile.openvino ${CMAKE_CURRENT_SOURCE_DIR} + COMMAND docker rm openvino_backend_ov || echo "error ignored..." || true + COMMAND docker create --name openvino_backend_ov ${TRITON_OPENVINO_DOCKER_IMAGE} + COMMAND rm -fr openvino + COMMAND docker cp openvino_backend_ov:/opt/openvino openvino + COMMAND docker rm openvino_backend_ov + COMMENT "Building OpenVino" +) +add_custom_target(ov_target DEPENDS openvino/lib/${OPENVINO_LIBRARY}) +add_library(openvino-library SHARED IMPORTED GLOBAL) +add_dependencies(openvino-library ov_target) +add_dependencies(triton-openvino-backend openvino-library) +set_target_properties( + openvino-library + PROPERTIES + IMPORTED_LOCATION openvino/lib/${OPENVINO_LIBRARY} +) + +# +# Install +# +include(GNUInstallDirs) +set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonOpenVINOBackend) + +install( + TARGETS + triton-openvino-backend + EXPORT + triton-openvino-backend-targets + LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/openvino + ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/openvino +) + +install( + DIRECTORY + ${CMAKE_CURRENT_BINARY_DIR}/openvino/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/openvino + PATTERN *lib EXCLUDE + PATTERN *bin EXCLUDE + PATTERN *include EXCLUDE +) + +install( + DIRECTORY + ${CMAKE_CURRENT_BINARY_DIR}/openvino/lib/ + USE_SOURCE_PERMISSIONS + DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/openvino +) + +install( + EXPORT + triton-openvino-backend-targets + FILE + TritonOpenVINOBackendTargets.cmake + NAMESPACE + TritonOpenVINOBackend:: + DESTINATION + ${INSTALL_CONFIGDIR} +) + +include(CMakePackageConfigHelpers) +configure_package_config_file( + ${CMAKE_CURRENT_LIST_DIR}/cmake/TritonOpenVINOBackendConfig.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/TritonOpenVINOBackendConfig.cmake + INSTALL_DESTINATION ${INSTALL_CONFIGDIR} +) + +install( + FILES + ${CMAKE_CURRENT_BINARY_DIR}/TritonOpenVINOBackendConfig.cmake + DESTINATION ${INSTALL_CONFIGDIR} +) + +# +# Export from build tree +# +export( + EXPORT triton-openvino-backend-targets + FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonOpenVINOBackendTargets.cmake + NAMESPACE TritonOpenVINOBackend:: +) + +export(PACKAGE TritonOpenVINOBackend) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e8584b9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,25 @@ +Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of NVIDIA CORPORATION nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
index 8a09e54..62d2826 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,38 @@
-# openvino_backend
\ No newline at end of file
+
+
+[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)
+
+# OpenVINO Backend
+
+The Triton backend for
+[OpenVINO](https://docs.openvinotoolkit.org/latest/index.html). You
+can learn more about Triton backends in the [backend
+repo](https://github.com/triton-inference-server/backend). Ask
+questions or report problems on the main Triton [issues
+page](https://github.com/triton-inference-server/server/issues).
diff --git a/cmake/TritonOpenVINOBackendConfig.cmake.in b/cmake/TritonOpenVINOBackendConfig.cmake.in
new file mode 100644
index 0000000..d7c0b7f
--- /dev/null
+++ b/cmake/TritonOpenVINOBackendConfig.cmake.in
@@ -0,0 +1,39 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
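+# Illustrative downstream usage only (the "my_app" target name is a
+# placeholder, not part of this file):
+#
+#   find_package(TritonOpenVINOBackend REQUIRED)
+#   target_link_libraries(my_app PRIVATE ${TRITONOPENVINOBACKEND_LIBRARIES})
+#
+# The TRITONOPENVINOBACKEND_LIBRARIES variable and the exported
+# TritonOpenVINOBackend::triton-openvino-backend target are defined below.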
+ +include(CMakeFindDependencyMacro) + +get_filename_component( + TRITONOPENVINOBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH +) + +list(APPEND CMAKE_MODULE_PATH ${TRITONOPENVINOBACKEND_CMAKE_DIR}) + +if(NOT TARGET TritonOpenVINOBackend::triton-openvino-backend) + include("${TRITONOPENVINOBACKEND_CMAKE_DIR}/TritonOpenVINOBackendTargets.cmake") +endif() + +set(TRITONOPENVINOBACKEND_LIBRARIES TritonOpenVINOBackend::triton-openvino-backend) diff --git a/src/libtriton_openvino.ldscript b/src/libtriton_openvino.ldscript new file mode 100644 index 0000000..61e9a06 --- /dev/null +++ b/src/libtriton_openvino.ldscript @@ -0,0 +1,30 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +{ + global: + TRITONBACKEND_*; + local: *; +}; diff --git a/src/openvino.cc b/src/openvino.cc new file mode 100644 index 0000000..4cafadb --- /dev/null +++ b/src/openvino.cc @@ -0,0 +1,1839 @@ +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include +#include +#include +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_input_collector.h" +#include "triton/backend/backend_memory.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" +#include "triton/backend/backend_output_responder.h" + +// +// OpenVINO Backend that implements the TRITONBACKEND API. +// + +namespace triton { namespace backend { namespace openvino { + +// +// ModelState +// +// State associated with a model that is using this backend. An object +// of this class is created and associated with each +// TRITONBACKEND_Model. +// +class ModelState : public BackendModel { + public: + static TRITONSERVER_Error* Create( + TRITONBACKEND_Model* triton_model, ModelState** state); + virtual ~ModelState() = default; + +#if 0 + // Load an OpenVINNO model using 'artifact_name' as the name for the + // model file/directory. If 'instance_group_kind' is not + // TRITONSERVER_INSTANCEGROUPKIND_AUTO then use it and + // 'instance_group_device_id' to initialize the appropriate + // execution providers. Return in 'model_path' the full path to the + // model file, return in 'session' and 'allocator' the ORT session + // and allocator. + TRITONSERVER_Error* LoadModel( + const std::string& artifact_name, + const TRITONSERVER_InstanceGroupKind instance_group_kind, + const int32_t instance_group_device_id, std::string* model_path, + OrtSession** session, OrtAllocator** allocator); +#endif + + private: + ModelState(TRITONBACKEND_Model* triton_model); +#if 0 + TRITONSERVER_Error* AutoCompleteConfig(); + TRITONSERVER_Error* AutoCompleteMaxBatch( + const OnnxTensorInfoMap& input_tensor_infos, + const OnnxTensorInfoMap& output_tensor_infos); + TRITONSERVER_Error* AutoCompleteIO( + const char* key, const OnnxTensorInfoMap& io_infos); +#endif +}; + +TRITONSERVER_Error* +ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) +{ + try { + *state = new ModelState(triton_model); + } + catch (const BackendModelException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelException")); + RETURN_IF_ERROR(ex.err_); + } + +#if 0 + // Auto-complete the configuration if requested... 
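+  // The flow carried over from the onnxruntime backend is: ask Triton
+  // whether auto-complete was requested for this model, fill in any
+  // missing input/output/max_batch_size settings, serialize the updated
+  // configuration to JSON, and hand it back to Triton with
+  // TRITONBACKEND_ModelSetConfig.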
+ bool auto_complete_config = false; + RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( + triton_model, &auto_complete_config)); + if (auto_complete_config) { + RETURN_IF_ERROR((*state)->AutoCompleteConfig()); + + triton::common::TritonJson::WriteBuffer json_buffer; + (*state)->ModelConfig().Write(&json_buffer); + + TRITONSERVER_Message* message; + RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson( + &message, json_buffer.Base(), json_buffer.Size())); + RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig( + triton_model, 1 /* config_version */, message)); + } +#endif + + return nullptr; // success +} + +ModelState::ModelState(TRITONBACKEND_Model* triton_model) + : BackendModel(triton_model) +{ +#if 0 +// Create session options that will be cloned and used for each + // instance when creating that instance's session. + OrtSessionOptions* soptions; + THROW_IF_BACKEND_MODEL_ORT_ERROR(ort_api->CreateSessionOptions(&soptions)); + session_options_.reset(soptions); + + THROW_IF_BACKEND_MODEL_ORT_ERROR(ort_api->SetIntraOpNumThreads(soptions, 1)); + + GraphOptimizationLevel optimization_level = + GraphOptimizationLevel::ORT_ENABLE_ALL; + { + triton::common::TritonJson::Value optimization; + if (ModelConfig().Find("optimization", &optimization)) { + triton::common::TritonJson::Value graph; + if (optimization.Find("graph", &graph)) { + int64_t graph_level = 0; + THROW_IF_BACKEND_MODEL_ERROR(graph.MemberAsInt("level", &graph_level)); + if (graph_level == -1) { + optimization_level = GraphOptimizationLevel::ORT_ENABLE_BASIC; + } else if (graph_level == 1) { + optimization_level = GraphOptimizationLevel::ORT_ENABLE_EXTENDED; + } + } + } + } + THROW_IF_BACKEND_MODEL_ORT_ERROR( + ort_api->SetSessionGraphOptimizationLevel(soptions, optimization_level)); + + // FIXME. Is it possible to share a single OrtSession across + // multiple instances? If so then should move loading and validation + // of the session to here instead of creating a session for each + // instance in ModelStateInstance::Create(). +#endif +} + +#if 0 +TRITONSERVER_Error* +ModelState::LoadModel( + const std::string& artifact_name, + const TRITONSERVER_InstanceGroupKind instance_group_kind, + const int32_t instance_group_device_id, std::string* model_path, + OrtSession** session, OrtAllocator** allocator) +{ + // Find the ONNX file that describes the model itself. If the model + // configuration doesn't have an explicit model file specified then + // use the default name ("model.onnx"). + std::string cc_model_filename = artifact_name; + if (cc_model_filename.empty()) { + cc_model_filename = "model.onnx"; + } + + *model_path = JoinPath( + {RepositoryPath(), std::to_string(Version()), cc_model_filename}); + + // If the model path is a directory then the actual model is + // /model.onnx. + { + bool is_dir; + RETURN_IF_ERROR(IsDirectory(*model_path, &is_dir)); + if (is_dir) { + *model_path = JoinPath({*model_path, "model.onnx"}); + } + } + + { + bool exists; + RETURN_IF_ERROR(FileExists(*model_path, &exists)); + RETURN_ERROR_IF_FALSE( + exists, TRITONSERVER_ERROR_UNAVAILABLE, + std::string("unable to find '") + *model_path + + "' for model instance '" + Name() + "'"); + } + + // Make a clone for the session options for this instance... + OrtSessionOptions* soptions; + RETURN_IF_ORT_ERROR( + ort_api->CloneSessionOptions(session_options_.get(), &soptions)); + std::unique_ptr soptions_wrapper( + soptions); + + bool need_lock = false; + + // Execution providers if they are requested... 
kind == AUTO if used + // to indicate that execution providers should not be added (this is + // just a local convention to this function, not the standard + // interpretation of AUTO). + if (instance_group_kind != TRITONSERVER_INSTANCEGROUPKIND_AUTO) { + // Don't need to ensure uniqueness of the providers, ONNX Runtime + // will check it. + + // CPU execution providers + { + triton::common::TritonJson::Value optimization; + if (model_config_.Find("optimization", &optimization)) { + triton::common::TritonJson::Value eas; + if (optimization.Find("execution_accelerators", &eas)) { + triton::common::TritonJson::Value cpu_eas; + if (eas.Find("cpu_execution_accelerator", &cpu_eas)) { + for (size_t ea_idx = 0; ea_idx < cpu_eas.ArraySize(); ea_idx++) { + triton::common::TritonJson::Value ea; + RETURN_IF_ERROR(cpu_eas.IndexAsObject(ea_idx, &ea)); + std::string name; + RETURN_IF_ERROR(ea.MemberAsString("name", &name)); +#ifdef TRITON_ENABLE_ONNXRUNTIME_OPENVINO + if (name == kOpenVINOExecutionAccelerator) { + need_lock = true; + RETURN_IF_ORT_ERROR( + OrtSessionOptionsAppendExecutionProvider_OpenVINO( + soptions, "")); + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string( + "OpenVINO Execution Accelerator is set for '") + + Name() + "' on CPU") + .c_str()); + continue; + } +#endif // TRITON_ENABLE_ONNXRUNTIME_OPENVINO + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("unknown Execution Accelerator '") + name + + "' is requested") + .c_str()); + } + } + } + } + } + } + + // Register all op libraries that contain custom operations. + { + triton::common::TritonJson::Value model_ops; + if (model_config_.Find("model_operations", &model_ops)) { + triton::common::TritonJson::Value op_library_filenames; + if (model_ops.Find("op_library_filename", &op_library_filenames)) { + for (size_t op_idx = 0; op_idx < op_library_filenames.ArraySize(); + op_idx++) { + std::string op_filename; + RETURN_IF_ERROR( + op_library_filenames.IndexAsString(op_idx, &op_filename)); + void* library_handle = nullptr; + RETURN_IF_ORT_ERROR(ort_api->RegisterCustomOpsLibrary( + soptions, op_filename.c_str(), &library_handle)); + } + } + } + } + + // ONNX session creation with OpenVINO is not thread-safe, + // so multiple creations are serialized with a global lock. + static std::mutex global_context_mu; + std::unique_lock glock(global_context_mu, std::defer_lock); + if (need_lock) { + glock.lock(); + } + + RETURN_IF_ERROR(OnnxLoader::LoadSession( + true /* is_path */, *model_path, soptions, session)); + RETURN_IF_ORT_ERROR(ort_api->GetAllocatorWithDefaultOptions(allocator)); + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelState::AutoCompleteConfig() +{ + // If the model configuration already specifies inputs and outputs + // then don't perform any auto-completion. + size_t input_cnt = 0; + size_t output_cnt = 0; + { + triton::common::TritonJson::Value inputs; + if (ModelConfig().Find("input", &inputs)) { + input_cnt = inputs.ArraySize(); + } + triton::common::TritonJson::Value outputs; + if (ModelConfig().Find("output", &outputs)) { + output_cnt = outputs.ArraySize(); + } + } + + if ((input_cnt > 0) && (output_cnt > 0)) { + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("skipping model configuration auto-complete for '") + + Name() + "': inputs and outputs already specified") + .c_str()); + return nullptr; // success + } + + std::string artifact_name; + RETURN_IF_ERROR( + ModelConfig().MemberAsString("default_model_filename", &artifact_name)); + + // Must cleanup 'session'. 
'allocator' is default allocator which + // is managed by ONNX Runtime so don't need to free/release + std::unique_ptr session; + OrtAllocator* allocator; + std::string model_path; + { + OrtSession* sptr = nullptr; + RETURN_IF_ERROR(LoadModel( + artifact_name, TRITONSERVER_INSTANCEGROUPKIND_AUTO, 0, &model_path, + &sptr, &allocator)); + session.reset(sptr); + } + + OnnxTensorInfoMap input_tensor_infos; + RETURN_IF_ERROR(InputInfos(session.get(), allocator, input_tensor_infos)); + OnnxTensorInfoMap output_tensor_infos; + RETURN_IF_ERROR(OutputInfos(session.get(), allocator, output_tensor_infos)); + + RETURN_IF_ERROR( + AutoCompleteMaxBatch(input_tensor_infos, output_tensor_infos)); + if (input_cnt == 0) { + RETURN_IF_ERROR(AutoCompleteIO("input", input_tensor_infos)); + } + if (output_cnt == 0) { + RETURN_IF_ERROR(AutoCompleteIO("output", output_tensor_infos)); + } + + if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) { + triton::common::TritonJson::WriteBuffer buffer; + RETURN_IF_ERROR(ModelConfig().PrettyWrite(&buffer)); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("post auto-complete:\n") + buffer.Contents()).c_str()); + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelState::AutoCompleteMaxBatch( + const OnnxTensorInfoMap& input_tensor_infos, + const OnnxTensorInfoMap& output_tensor_infos) +{ + // Determine if the model can potentially support batching. All + // input and output tensors must have a variable first dimension. + bool can_support_batching = true; + for (const auto& io_info : input_tensor_infos) { + const auto& dims = io_info.second.dims_; + if ((dims.size() == 0) || (dims[0] != -1)) { + can_support_batching = false; + } + } + for (const auto& io_info : output_tensor_infos) { + const auto& dims = io_info.second.dims_; + if ((dims.size() == 0) || (dims[0] != -1)) { + can_support_batching = false; + } + } + + // Set max-batch-size to 1 if we have determined that batching is + // supported and max-batch-size is not specified. We need to update + // the configuration itself as well as the cached value we have already + // initialized in the model state. + if (can_support_batching) { + if (MaxBatchSize() == 0) { + triton::common::TritonJson::Value mbs_value; + ModelConfig().Find("max_batch_size", &mbs_value); + mbs_value.SetInt(1); + SetMaxBatchSize(1); + } + } else if (MaxBatchSize() != 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("autofill failed for model '") + Name() + + "': model does not support batching while non-zero max_batch_size" + " is specified").c_str()); + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelState::AutoCompleteIO(const char* key, const OnnxTensorInfoMap& io_infos) +{ + triton::common::TritonJson::Value existing_ios; + bool found_ios = ModelConfig().Find(key, &existing_ios); + + triton::common::TritonJson::Value ios( + ModelConfig(), triton::common::TritonJson::ValueType::ARRAY); + for (const auto& io_info : io_infos) { + triton::common::TritonJson::Value io( + ModelConfig(), triton::common::TritonJson::ValueType::OBJECT); + RETURN_IF_ERROR(io.AddString("name", io_info.first)); + RETURN_IF_ERROR(io.AddString( + "data_type", OnnxDataTypeToModelConfigDataType(io_info.second.type_))); + + // The model signature supports batching then the first dimension + // is -1 and should not appear in the model configuration 'dims' + // that we are creating. 
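+    // For example (shapes are illustrative only): a model tensor
+    // reported as [-1, 3, 224, 224] on a model with batching enabled is
+    // recorded in the configuration as dims [3, 224, 224]; the leading
+    // -1 batch dimension is implied by max_batch_size.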
+ const auto& io_info_dims = io_info.second.dims_; + triton::common::TritonJson::Value dims( + ModelConfig(), triton::common::TritonJson::ValueType::ARRAY); + for (size_t i = (MaxBatchSize() > 0) ? 1 : 0; i < io_info_dims.size(); + ++i) { + RETURN_IF_ERROR(dims.AppendInt(io_info_dims[i])); + } + + // If dims are empty then must use a reshape... + if (dims.ArraySize() == 0) { + RETURN_IF_ERROR(dims.AppendInt(1)); + triton::common::TritonJson::Value reshape( + ModelConfig(), triton::common::TritonJson::ValueType::OBJECT); + triton::common::TritonJson::Value reshape_dims( + ModelConfig(), triton::common::TritonJson::ValueType::ARRAY); + RETURN_IF_ERROR(reshape.Add("shape", std::move(reshape_dims))); + RETURN_IF_ERROR(io.Add("reshape", std::move(reshape))); + } + RETURN_IF_ERROR(io.Add("dims", std::move(dims))); + RETURN_IF_ERROR(ios.Append(std::move(io))); + } + + if (found_ios) { + existing_ios.Swap(ios); + } else { + ModelConfig().Add(key, std::move(ios)); + } + + return nullptr; // success +} +#endif + +// +// ModelInstanceState +// +// State associated with a model instance. An object of this class is +// created and associated with each TRITONBACKEND_ModelInstance. +// +class ModelInstanceState : public BackendModelInstance { + public: + static TRITONSERVER_Error* Create( + ModelState* model_state, + TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state); + virtual ~ModelInstanceState(); + + // Get the state of the model that corresponds to this instance. + ModelState* StateForModel() const { return model_state_; } + +#if 0 + // Execute... + void ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count); +#endif + + private: + ModelInstanceState( + ModelState* model_state, + TRITONBACKEND_ModelInstance* triton_model_instance); +#if 0 + void ReleaseOrtRunResources(); + TRITONSERVER_Error* ValidateBooleanSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control); + TRITONSERVER_Error* ValidateTypedSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control); + TRITONSERVER_Error* ValidateInputs(const size_t expected_input_cnt); + TRITONSERVER_Error* ValidateOutputs(); + void OrtRun( + std::vector* responses, + const uint32_t response_count, + const std::vector& input_names, + const std::vector& output_names); + void SetInputTensors( + size_t total_batch_size, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::vector* responses, + BackendInputCollector* collector, std::vector* input_names, + bool* cuda_copy); + void SetStringInputTensor( + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector* responses, const char* input_name, + std::vector* string_ptrs, bool* cuda_copy); + void SetStringInputBuffer( + const std::string& name, const std::vector& expected_byte_sizes, + const std::vector& expected_element_cnts, + std::vector* responses, char* input_buffer, + std::vector* string_ptrs); + void FillStringData(std::vector* string_ptrs, size_t cnt); + void ReadOutputTensors( + size_t total_batch_size, const std::vector& output_names, + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector* responses); + bool SetStringOutputBuffer( + const std::string& name, const char* content, const size_t* offsets, + std::vector* batchn_shape, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::vector* 
responses); +#endif + + ModelState* model_state_; + + // The full path to the model file. + std::string model_path_; + +#if 0 + // Onnx Runtime variables that are used across runs on this + // instance. + OrtSession* session_; + OrtAllocator* allocator_; + + // Onnx Runtime variables that will be reset and used for every run + // on this instance. + std::vector input_tensors_; + std::vector output_tensors_; + std::vector input_tensor_memories_; +#endif +}; + +TRITONSERVER_Error* +ModelInstanceState::Create( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state) +{ + try { + *state = new ModelInstanceState(model_state, triton_model_instance); + } + catch (const BackendModelInstanceException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelInstanceException")); + RETURN_IF_ERROR(ex.err_); + } + + return nullptr; // success +} + +ModelInstanceState::ModelInstanceState( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) + : BackendModelInstance(model_state, triton_model_instance) +{ +#if 0 + THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel( + ArtifactFilename(), Kind(), DeviceId(), &model_path_, &session_, + &allocator_)); + + size_t expected_input_cnt = 0; + { + triton::common::TritonJson::Value inputs; + if (model_state->ModelConfig().Find("input", &inputs)) { + expected_input_cnt = inputs.ArraySize(); + } + } + + // If this is a sequence model then make sure that the required + // inputs are present in the model and have the correct shape and + // datatype. + triton::common::TritonJson::Value sequence_batching; + if (model_state->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + bool have_start, have_end, have_ready, have_corrid; + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_START", false /* required */, + &have_start)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_END", false /* required */, + &have_end)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_READY", false /* required */, + &have_ready)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateTypedSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_CORRID", false /* required */, + &have_corrid)); + if (have_start) { + expected_input_cnt += 1; + } + if (have_end) { + expected_input_cnt += 1; + } + if (have_ready) { + expected_input_cnt += 1; + } + if (have_corrid) { + expected_input_cnt += 1; + } + } + + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs(expected_input_cnt)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); +#endif +} + +ModelInstanceState::~ModelInstanceState() +{ +#if 0 + ReleaseOrtRunResources(); + if (session_ != nullptr) { + OnnxLoader::UnloadSession(session_); + } + // 'allocator_' is default allocator which is managed by ONNX Runtime +#endif +} + +#if 0 +void +ModelInstanceState::ReleaseOrtRunResources() +{ + for (auto& tensor : input_tensors_) { + if (tensor != nullptr) { + ort_api->ReleaseValue(tensor); + } + } + input_tensors_.clear(); + + for (auto& tensor : output_tensors_) { + if (tensor != nullptr) { + ort_api->ReleaseValue(tensor); + } + } + output_tensors_.clear(); + + for (BackendMemory* mem : input_tensor_memories_) { + delete mem; + } + input_tensor_memories_.clear(); +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateBooleanSequenceControl( 
+ triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control) +{ + std::string tensor_name; + std::string tensor_datatype; + RETURN_IF_ERROR(GetBooleanSequenceControlProperties( + sequence_batching, model_state_->Name(), control_kind, required, + &tensor_name, &tensor_datatype, nullptr, nullptr, nullptr, nullptr)); + *have_control = !tensor_name.empty(); + if (*have_control) { + OnnxTensorInfoMap input_tensor_infos; + RETURN_IF_ERROR(InputInfos(session_, allocator_, input_tensor_infos)); + const auto& iit = input_tensor_infos.find(tensor_name); + if (iit == input_tensor_infos.end()) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("configuration specified sequence control '") + + tensor_name + "', but model does not provide that input") + .c_str()); + } + + // Control tensors must have shape [1]. + const int nonbatch_start_idx = (model_state_->MaxBatchSize() > 0) ? 1 : 0; + std::vector debatched_dims; + for (size_t i = nonbatch_start_idx; i < iit->second.dims_.size(); i++) { + debatched_dims.push_back(iit->second.dims_[i]); + } + + if ((debatched_dims.size() != 1) || (debatched_dims[0] != 1)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("unable to load model '") + model_state_->Name() + + "', sequence control '" + tensor_name + "' in model has dims " + + ShapeToString(debatched_dims) + " but dims [1] is expected") + .c_str()); + } + + if (ModelConfigDataTypeToOnnxDataType(tensor_datatype) != + iit->second.type_) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("unable to load model '") + model_state_->Name() + + "', sequence control '" + tensor_name + + "', the model expects data-type " + + OnnxDataTypeName(iit->second.type_) + + " but the model configuration specifies data-type " + + tensor_datatype) + .c_str()); + } + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateTypedSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control) +{ + std::string tensor_name; + std::string tensor_datatype; + RETURN_IF_ERROR(GetTypedSequenceControlProperties( + sequence_batching, model_state_->Name(), control_kind, required, + &tensor_name, &tensor_datatype)); + *have_control = !tensor_name.empty(); + if (*have_control) { + OnnxTensorInfoMap input_tensor_infos; + RETURN_IF_ERROR(InputInfos(session_, allocator_, input_tensor_infos)); + const auto& iit = input_tensor_infos.find(tensor_name); + if (iit == input_tensor_infos.end()) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("configuration specified sequence control '") + + tensor_name + "', but model does not provide that input") + .c_str()); + } + + // Control tensors must have shape [1]. + const int nonbatch_start_idx = (model_state_->MaxBatchSize() > 0) ? 
1 : 0; + std::vector debatched_dims; + for (size_t i = nonbatch_start_idx; i < iit->second.dims_.size(); i++) { + debatched_dims.push_back(iit->second.dims_[i]); + } + + if ((debatched_dims.size() != 1) || (debatched_dims[0] != 1)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("unable to load model '") + model_state_->Name() + + "', sequence control '" + tensor_name + "' in model has dims " + + ShapeToString(debatched_dims) + " but dims [1] is expected") + .c_str()); + } + + if (ModelConfigDataTypeToOnnxDataType(tensor_datatype) != + iit->second.type_) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("unable to load model '") + model_state_->Name() + + "', sequence control '" + tensor_name + + "', the model expects data-type " + + OnnxDataTypeName(iit->second.type_) + + " but the model configuration specifies data-type " + + tensor_datatype) + .c_str()); + } + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) +{ + std::set input_tensor_names; + RETURN_IF_ERROR(InputNames(session_, input_tensor_names)); + + OnnxTensorInfoMap input_tensor_infos; + RETURN_IF_ERROR(InputInfos(session_, allocator_, input_tensor_infos)); + + if (input_tensor_infos.size() != expected_input_cnt) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("unable to load model '") + model_state_->Name() + + "', configuration expects " + std::to_string(expected_input_cnt) + + " inputs, model provides " + std::to_string(input_tensor_infos.size())) + .c_str()); + } + + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("input", &ios)); + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + std::string io_dtype; + RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); + + auto iit = input_tensor_infos.find(io_name); + if (iit == input_tensor_infos.end()) { + RETURN_IF_ERROR(CheckAllowedModelInput(io, input_tensor_names)); + } + + auto onnx_data_type = ModelConfigDataTypeToOnnxDataType(io_dtype); + if (onnx_data_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("unsupported datatype ") + io_dtype + " for input '" + + io_name + "' for model '" + model_state_->Name() + "'") + .c_str()); + } else if (onnx_data_type != iit->second.type_) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("unable to load model '") + model_state_->Name() + + ", unexpected datatype " + + TRITONSERVER_DataTypeString( + ConvertFromOnnxDataType(iit->second.type_)) + + " for input '" + io_name + "', expecting " + io_dtype) + .c_str()); + } + + // If a reshape is provided for the input then use that when + // validating that the model matches what is expected. 
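+    // For example (hypothetical configuration): an input declared with
+    // dims [784] and reshape { shape: [1, 784] } is validated against
+    // the model tensor using [1, 784] rather than [784].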
+ std::vector dims; + triton::common::TritonJson::Value reshape; + if (io.Find("reshape", &reshape)) { + RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); + } else { + RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); + } + RETURN_IF_ERROR(CompareDimsSupported( + model_state_->Name(), io_name, iit->second.dims_, dims, + model_state_->MaxBatchSize(), false /* compare_exact */)); + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateOutputs() +{ + std::set output_tensor_names; + RETURN_IF_ERROR(OutputNames(session_, output_tensor_names)); + + OnnxTensorInfoMap output_tensor_infos; + RETURN_IF_ERROR(OutputInfos(session_, allocator_, output_tensor_infos)); + + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("output", &ios)); + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + std::string io_dtype; + RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); + + auto iit = output_tensor_infos.find(io_name); + if (iit == output_tensor_infos.end()) { + RETURN_IF_ERROR(CheckAllowedModelOutput(io, output_tensor_names)); + } + + auto onnx_data_type = ModelConfigDataTypeToOnnxDataType(io_dtype); + if (onnx_data_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("unsupported datatype ") + io_dtype + " for output '" + + io_name + "' for model '" + model_state_->Name() + "'") + .c_str()); + } else if (onnx_data_type != iit->second.type_) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("unable to load model '") + model_state_->Name() + + ", unexpected datatype " + + TRITONSERVER_DataTypeString( + ConvertFromOnnxDataType(iit->second.type_)) + + " for output '" + io_name + "', expecting " + io_dtype) + .c_str()); + } + + // If a reshape is provided for the input then use that when + // validating that the model matches what is expected. + std::vector dims; + triton::common::TritonJson::Value reshape; + if (io.Find("reshape", &reshape)) { + RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); + } else { + RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); + } + RETURN_IF_ERROR(CompareDimsSupported( + model_state_->Name(), io_name, iit->second.dims_, dims, + model_state_->MaxBatchSize(), true /* compare_exact */)); + } + + return nullptr; // success +} + +void +ModelInstanceState::ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count) +{ + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + + std::to_string(request_count) + " requests") + .c_str()); + + uint64_t exec_start_ns = 0; + SET_TIMESTAMP(exec_start_ns); + + const int max_batch_size = model_state_->MaxBatchSize(); + + // For each request collect the total batch size for this inference + // execution. The batch-size, number of inputs, and size of each + // input has already been checked so don't need to do that here. + size_t total_batch_size = 0; + for (size_t i = 0; i < request_count; i++) { + // If we get a nullptr request then something is badly wrong. Fail + // and release all requests. 
+ if (requests[i] == nullptr) { + RequestsRespondWithError( + requests, request_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "null request given to ONNX Runtime backend for '" + Name() + + "'") + .c_str())); + return; + } + + if (max_batch_size > 0) { + // Retrieve the batch size from one of the inputs, if the model + // supports batching, the first dimension size is batch size + TRITONBACKEND_Input* input; + TRITONSERVER_Error* err = + TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); + if (err == nullptr) { + const int64_t* shape; + err = TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + total_batch_size += shape[0]; + } + if (err != nullptr) { + RequestsRespondWithError(requests, request_count, err); + return; + } + } else { + total_batch_size += 1; + } + } + + // If there are no valid payloads then no need to run the inference. + if (total_batch_size == 0) { + return; + } + + // Make sure the maximum batch size is not exceeded. The + // total_batch_size must be 1 for models that don't support batching + // (i.e. max_batch_size == 0). If max_batch_size is exceeded then + // scheduler has done something badly wrong so fail and release all + // requests. + if ((total_batch_size != 1) && (total_batch_size > (size_t)max_batch_size)) { + RequestsRespondWithError( + requests, request_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "batch size " + std::to_string(total_batch_size) + " for '" + + Name() + "', max allowed is " + std::to_string(max_batch_size)) + .c_str())); + return; + } + + // At this point we are committed to running inference with all + // 'requests'. Create a response for each request. During input + // processing if there is an error with any request that error will + // be sent immediately with the corresponding response (and the + // response unique_ptr will then be nullptr). The request object + // itself will not be released until after all inferencing is done + // (below) as we may need to access the request object when + // determine how to process outputs (for example, even if we don't + // need the outputs for a request that has an error, we do need to + // know the size of those outputs associated with the request so we + // can skip them in the output tensors). + std::vector responses; + responses.reserve(request_count); + + for (size_t i = 0; i < request_count; i++) { + TRITONBACKEND_Response* response; + auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); + if (err == nullptr) { + responses.emplace_back(response); + } else { + responses.emplace_back(nullptr); + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); + TRITONSERVER_ErrorDelete(err); + } + } + + // Use scoped class to clean up ORT tensors and other resources that + // need to persist until ORT run completes. + struct ScopedCleanup { + ScopedCleanup(ModelInstanceState* ctx) : ctx_(ctx) {} + ~ScopedCleanup() + { + if (ctx_ != nullptr) { + ctx_->ReleaseOrtRunResources(); + } + } + ModelInstanceState* ctx_; + } io_tensor_wrapper(this); + + std::vector input_names; + bool cuda_copy = false; + BackendInputCollector collector( + requests, request_count, &responses, model_state_->TritonMemoryManager(), + model_state_->EnablePinnedInput(), CudaStream()); + SetInputTensors( + total_batch_size, requests, request_count, &responses, &collector, + &input_names, &cuda_copy); + + // Request to retrieve all model outputs. 
'output_names' and + // 'output_tensors_' are parallel vectors and so must be kept in + // sync. [TODO] should collect only the outputs needed by some + // request. + std::vector output_names; + { + triton::common::TritonJson::Value ios; + TRITONSERVER_Error* err = + model_state_->ModelConfig().MemberAsArray("output", &ios); + if (err == nullptr) { + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + err = ios.IndexAsObject(i, &io); + if (err != nullptr) { + break; + } + + // Use names from ModelConfig by reference since the model + // config will persist longer than this inference execution. + const char* io_name; + size_t io_name_len; + err = io.MemberAsString("name", &io_name, &io_name_len); + if (err != nullptr) { + break; + } + + output_names.emplace_back(io_name); + output_tensors_.emplace_back(nullptr); + } + } + + if (err != nullptr) { + SendErrorForResponses(&responses, request_count, err); + output_names.clear(); + } + } + + uint64_t compute_start_ns = 0; + SET_TIMESTAMP(compute_start_ns); + + // Run... + OrtRun(&responses, request_count, input_names, output_names); + + uint64_t compute_end_ns = 0; + SET_TIMESTAMP(compute_end_ns); + + ReadOutputTensors( + total_batch_size, output_names, requests, request_count, &responses); + + uint64_t exec_end_ns = 0; + SET_TIMESTAMP(exec_end_ns); + + // Send all the responses that haven't already been sent because of + // an earlier error. Note that the responses are not set to nullptr + // here as we need that indication below to determine if the request + // we successful or not. + for (auto& response : responses) { + if (response != nullptr) { + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), + "failed to send onnxruntime backend response"); + } + } + + // Report statistics for each request. + for (uint32_t r = 0; r < request_count; ++r) { + auto& request = requests[r]; + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportStatistics( + TritonModelInstance(), request, + (responses[r] != nullptr) /* success */, exec_start_ns, + compute_start_ns, compute_end_ns, exec_end_ns), + "failed reporting request statistics"); + + LOG_IF_ERROR( + TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), + "failed releasing request"); + } + + // Report the entire batch statistics. 
+ LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportBatchStatistics( + TritonModelInstance(), total_batch_size, exec_start_ns, + compute_start_ns, compute_end_ns, exec_end_ns), + "failed reporting batch request statistics"); +} + +void +ModelInstanceState::OrtRun( + std::vector* responses, + const uint32_t response_count, const std::vector& input_names, + const std::vector& output_names) +{ + OrtStatus* status = ort_api->Run( + session_, NULL /* run options */, input_names.data(), + (const OrtValue* const*)input_tensors_.data(), input_tensors_.size(), + output_names.data(), output_names.size(), output_tensors_.data()); + + if (status != nullptr) { + OrtErrorCode code = ort_api->GetErrorCode(status); + std::string msg = ort_api->GetErrorMessage(status); + SendErrorForResponses( + responses, response_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("openvino execute failure ") + + std::to_string(code) + ": " + msg) + .c_str())); + } +} + +void +ModelInstanceState::SetInputTensors( + size_t total_batch_size, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::vector* responses, + BackendInputCollector* collector, std::vector* input_names, + bool* cuda_copy) +{ + const int max_batch_size = model_state_->MaxBatchSize(); + + // All requests must have equally-sized input tensors so use any + // request as the representative for the input tensors. + uint32_t input_count; + RESPOND_ALL_AND_RETURN_IF_ERROR( + responses, request_count, + TRITONBACKEND_RequestInputCount(requests[0], &input_count)); + for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { + TRITONBACKEND_Input* input; + RESPOND_ALL_AND_RETURN_IF_ERROR( + responses, request_count, + TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); + + const char* input_name; + TRITONSERVER_DataType input_datatype; + const int64_t* input_shape; + uint32_t input_dims_count; + RESPOND_ALL_AND_RETURN_IF_ERROR( + responses, request_count, + TRITONBACKEND_InputProperties( + input, &input_name, &input_datatype, &input_shape, + &input_dims_count, nullptr, nullptr)); + + input_names->emplace_back(input_name); + input_tensors_.emplace_back(nullptr); + + // The shape for the entire input patch, [total_batch_size, ...] + std::vector batchn_shape( + input_shape, input_shape + input_dims_count); + if (max_batch_size != 0) { + batchn_shape[0] = total_batch_size; + } + + // [TODO] currently ONNX Runtime only recognize input data on CPU + // https://github.com/microsoft/onnxruntime/issues/1621 + if (input_datatype != TRITONSERVER_TYPE_BYTES) { + // The input must be in contiguous CPU memory. Use a pinned + // memory if possible for the case where the inputs are being + // provided in GPU memory. + // + // [TODO] a couple of optimizations are possible here. 1) if we + // know that all data for this input across all requests was not + // in GPU memory, then we could just use regular CPU memory and + // not pinned memory. 2) if there is a single request and for + // this input the data is already in contiguous CPU memory then + // we don't need to copy at all. 
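+      // As written, BackendMemory::Create is given both allocation types
+      // below, preferring the pinned pool and falling back to plain CPU
+      // memory if pinned memory cannot be allocated.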
+ const int64_t batchn_byte_size = + GetByteSize(input_datatype, batchn_shape); + + BackendMemory* input_memory; + RESPOND_ALL_AND_RETURN_IF_ERROR( + responses, request_count, + BackendMemory::Create( + model_state_->TritonMemoryManager(), + {BackendMemory::AllocationType::CPU_PINNED_POOL, + BackendMemory::AllocationType::CPU}, + 0 /* memory_type_id */, batchn_byte_size, &input_memory)); + input_tensor_memories_.push_back(input_memory); + + TRITONSERVER_MemoryType input_memtype = input_memory->MemoryType(); + char* input_buffer = input_memory->MemoryPtr(); + + // Create ORT Tensor + const OrtMemoryInfo* allocator_info; + RESPOND_ALL_AND_RETURN_IF_ORT_ERROR( + responses, request_count, + ort_api->AllocatorGetInfo(allocator_, &allocator_info)); + RESPOND_ALL_AND_RETURN_IF_ORT_ERROR( + responses, request_count, + ort_api->CreateTensorWithDataAsOrtValue( + allocator_info, (void*)input_buffer, batchn_byte_size, + batchn_shape.data(), batchn_shape.size(), + ConvertToOnnxDataType(input_datatype), &input_tensors_.back())); + + collector->ProcessTensor( + input_name, input_buffer, batchn_byte_size, input_memtype, 0); + } else { + // For BYTES input, we need to convert the serialized string + // representation into what is required for ORT. ORT expects a + // vector of char*, one for each element. For each tensor we get + // a copy of the data in a contiguous CPU buffer and then + // in-place modify that from the Triton + // ... serialization into a + // ... serialization + // and then initialize 'string_ptrs' to point to each . + std::vector string_ptrs; + + SetStringInputTensor( + requests, request_count, responses, input_name, &string_ptrs, + cuda_copy); + + RESPOND_ALL_AND_RETURN_IF_ORT_ERROR( + responses, request_count, + ort_api->CreateTensorAsOrtValue( + allocator_, batchn_shape.data(), batchn_shape.size(), + ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING, &input_tensors_.back())); + RESPOND_ALL_AND_RETURN_IF_ORT_ERROR( + responses, request_count, + ort_api->FillStringTensor( + input_tensors_.back(), string_ptrs.data(), string_ptrs.size())); + } + } + + // Finalize... + *cuda_copy |= collector->Finalize(); +} + +void +ModelInstanceState::SetStringInputTensor( + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector* responses, const char* input_name, + std::vector* string_ptrs, bool* cuda_copy) +{ + size_t total_byte_size = 0; + std::vector expected_byte_sizes; + std::vector expected_element_cnts; + expected_byte_sizes.reserve(request_count); + expected_element_cnts.reserve(request_count); + for (size_t ridx = 0; ridx < request_count; ++ridx) { + TRITONBACKEND_Input* in; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[ridx]), + TRITONBACKEND_RequestInput(requests[ridx], input_name, &in)); + + const int64_t* input_shape; + uint32_t input_dims_count; + uint64_t input_byte_size; + RESPOND_ALL_AND_RETURN_IF_ERROR( + responses, request_count, + TRITONBACKEND_InputProperties( + in, nullptr, nullptr, &input_shape, &input_dims_count, + &input_byte_size, nullptr)); + + // Skip input in this request if error response has already been sent. 
+ if ((*responses)[ridx] == nullptr) { + expected_byte_sizes.push_back(0); + expected_element_cnts.push_back(0); + } else { + expected_element_cnts.push_back( + GetElementCount(input_shape, input_dims_count)); + expected_byte_sizes.push_back(input_byte_size); + } + + total_byte_size += expected_byte_sizes.back(); + } + + // For string input, the copy to contiguous buffer is needed because ORT + // expects elements to be C strings thus we need to modify input buffer. + // Reserve one more byte at the end of input_buffer to ensure last + // element of String data can become valid C string. + BackendMemory* input_memory; + RESPOND_ALL_AND_RETURN_IF_ERROR( + responses, request_count, + BackendMemory::Create( + model_state_->TritonMemoryManager(), + {BackendMemory::AllocationType::CPU_PINNED_POOL, + BackendMemory::AllocationType::CPU}, + 0 /* memory_type_id */, total_byte_size + 1, &input_memory)); + input_tensor_memories_.push_back(input_memory); + + const TRITONSERVER_MemoryType mem_type = input_memory->MemoryType(); + char* input_buffer = input_memory->MemoryPtr(); + + size_t buffer_offset = 0; + for (size_t ridx = 0; ridx < request_count; ++ridx) { + TRITONBACKEND_Input* in; + TRITONSERVER_Error* err = + TRITONBACKEND_RequestInput(requests[ridx], input_name, &in); + if ((err == nullptr) && ((*responses)[ridx] != nullptr)) { + uint32_t input_buffer_count; + RESPOND_ALL_AND_RETURN_IF_ERROR( + responses, request_count, + TRITONBACKEND_InputProperties( + in, nullptr, nullptr, nullptr, nullptr, nullptr, + &input_buffer_count)); + + size_t input_offset = 0; + for (size_t idx = 0; idx < input_buffer_count; ++idx) { + const void* src_buffer; + size_t src_byte_size; + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + err = TRITONBACKEND_InputBuffer( + in, idx, &src_buffer, &src_byte_size, &src_memory_type, + &src_memory_type_id); + if (err == nullptr) { + if ((input_offset + src_byte_size) > expected_byte_sizes[ridx]) { + err = TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("buffer size for input '") + input_name + + "' exceeds batch byte size " + + std::to_string(expected_byte_sizes[ridx])) + .c_str()); + } else { + bool cuda_used = false; + err = CopyBuffer( + input_name, src_memory_type, src_memory_type_id, mem_type, 0, + src_byte_size, src_buffer, + input_buffer + buffer_offset + input_offset, CudaStream(), + &cuda_used); + *cuda_copy |= cuda_used; + } + } + + if (err == nullptr) { + input_offset += src_byte_size; + } else { + break; + } + } + } + + if (err != nullptr) { + if ((*responses)[ridx] != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(&((*responses)[ridx]), err); + } + + TRITONSERVER_ErrorDelete(err); + } + + buffer_offset += expected_byte_sizes[ridx]; + } + + // Modify input buffer and set string expected by ORT + SetStringInputBuffer( + input_name, expected_byte_sizes, expected_element_cnts, responses, + input_buffer, string_ptrs); + input_buffer[total_byte_size] = 0; +} + +void +ModelInstanceState::SetStringInputBuffer( + const std::string& input_name, + const std::vector& expected_byte_sizes, + const std::vector& expected_element_cnts, + std::vector* responses, char* input_buffer, + std::vector* string_ptrs) +{ + // offset for each response + size_t buffer_copy_offset = 0; + for (size_t idx = 0; idx < expected_byte_sizes.size(); idx++) { + const size_t expected_byte_size = expected_byte_sizes[idx]; + const size_t expected_element_cnt = expected_element_cnts[idx]; + + size_t element_cnt = 0; + if ((*responses)[idx] != nullptr) { + size_t 
remaining_bytes = expected_byte_size; + char* data_content = input_buffer + buffer_copy_offset; + // Continue if the remaining bytes may still contain size info + while (remaining_bytes >= sizeof(uint32_t)) { + if (element_cnt >= expected_element_cnt) { + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("unexpected number of string elements ") + + std::to_string(element_cnt + 1) + " for inference input '" + + input_name + "', expecting " + + std::to_string(expected_element_cnt)) + .c_str())); + break; + } + + const uint32_t len = *(reinterpret_cast(data_content)); + remaining_bytes -= sizeof(uint32_t); + // Make first byte of size info 0, so that if there is string data + // in front of it, the data becomes valid C string. + *data_content = 0; + data_content = data_content + sizeof(uint32_t); + if (len > remaining_bytes) { + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("incomplete string data for inference input '") + + input_name + "', expecting string of length " + + std::to_string(len) + " but only " + + std::to_string(remaining_bytes) + " bytes available") + .c_str())); + break; + } else { + string_ptrs->push_back(data_content); + element_cnt++; + data_content = data_content + len; + remaining_bytes -= len; + } + } + } + + FillStringData(string_ptrs, expected_element_cnt - element_cnt); + buffer_copy_offset += expected_byte_size; + } +} + +void +ModelInstanceState::FillStringData( + std::vector* string_ptrs, size_t cnt) +{ + static const char* empty = ""; + for (size_t c = 0; c < cnt; c++) { + string_ptrs->push_back(empty); + } +} + +void +ModelInstanceState::ReadOutputTensors( + size_t total_batch_size, const std::vector& output_names, + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector* responses) +{ + BackendOutputResponder responder( + requests, request_count, responses, model_state_->MaxBatchSize(), + model_state_->TritonMemoryManager(), model_state_->EnablePinnedInput(), + CudaStream()); + + // Use to hold string output contents + bool cuda_copy = false; + std::vector> string_buffers; + for (size_t idx = 0; idx < output_names.size(); idx++) { + std::string name = output_names[idx]; + + OrtValue* output_tensor = output_tensors_[idx]; + if (output_tensor == nullptr) { + RESPOND_ALL_AND_RETURN_IF_ERROR( + responses, request_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("output tensor '") + name + "' is not found") + .c_str())); + } + + // Get output type and shape + OrtTypeInfo* typeinfo; + RESPOND_ALL_AND_RETURN_IF_ORT_ERROR( + responses, request_count, + ort_api->GetTypeInfo(output_tensor, &typeinfo)); + std::unique_ptr typeinfo_wrapper(typeinfo); + + const OrtTensorTypeAndShapeInfo* type_and_shape; + RESPOND_ALL_AND_RETURN_IF_ORT_ERROR( + responses, request_count, + ort_api->CastTypeInfoToTensorInfo(typeinfo, &type_and_shape)); + + size_t num_dims; + RESPOND_ALL_AND_RETURN_IF_ORT_ERROR( + responses, request_count, + ort_api->GetDimensionsCount(type_and_shape, &num_dims)); + + std::vector batchn_shape(num_dims); + RESPOND_ALL_AND_RETURN_IF_ORT_ERROR( + responses, request_count, + ort_api->GetDimensions( + type_and_shape, batchn_shape.data(), batchn_shape.size())); + + ONNXTensorElementDataType type; + RESPOND_ALL_AND_RETURN_IF_ORT_ERROR( + responses, request_count, + ort_api->GetTensorElementType(type_and_shape, &type)); + + if (type == 
ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING) { + const size_t element_count = GetElementCount(batchn_shape); + size_t total_length = 0; + RESPOND_ALL_AND_RETURN_IF_ORT_ERROR( + responses, request_count, + ort_api->GetStringTensorDataLength(output_tensor, &total_length)); + + string_buffers.emplace_back(std::vector(total_length)); + auto content = string_buffers.back().data(); + std::vector offsets(element_count + 1); + RESPOND_ALL_AND_RETURN_IF_ORT_ERROR( + responses, request_count, + ort_api->GetStringTensorContent( + output_tensor, content, total_length, offsets.data(), + element_count)); + // Mark "passed end byte offset" + offsets[element_count] = total_length; + + cuda_copy |= SetStringOutputBuffer( + name, content, offsets.data(), &batchn_shape, requests, request_count, + responses); + } else { + // Fixed size data type... + char* output_buffer = nullptr; + RESPOND_ALL_AND_RETURN_IF_ORT_ERROR( + responses, request_count, + ort_api->GetTensorMutableData(output_tensor, (void**)&output_buffer)); + + // [TODO] currently ONNX output data are always on CPU + // https://github.com/microsoft/onnxruntime/issues/1621 + responder.ProcessTensor( + name, ConvertFromOnnxDataType(type), batchn_shape, output_buffer, + TRITONSERVER_MEMORY_CPU, 0); + } + } + + // Finalize and wait for any pending buffer copies. + cuda_copy |= responder.Finalize(); + +} + +bool +ModelInstanceState::SetStringOutputBuffer( + const std::string& name, const char* content, const size_t* offsets, + std::vector* batchn_shape, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::vector* responses) +{ + size_t element_idx = 0; + bool cuda_copy = false; + for (size_t ridx = 0; ridx < request_count; ++ridx) { + const auto& request = requests[ridx]; + auto& response = (*responses)[ridx]; + + // batchn_shape holds the shape of the entire tensor batch. When + // batching is enabled override the first batch dimension with each + // requests batch size (reusing for efficiency). + if (model_state_->MaxBatchSize() > 0) { + TRITONBACKEND_Input* input; + TRITONBACKEND_RequestInputByIndex(request, 0 /* index */, &input); + const int64_t* shape; + TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + (*batchn_shape)[0] = shape[0]; + } + + const size_t expected_element_cnt = GetElementCount(*batchn_shape); + + // If 'request' requested this output then copy it from + // 'content'. If it did not request this output then just skip it + // in the 'content'. 
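+    // When the output was requested, each element is copied out as a
+    // 4-byte length prefix followed by the raw bytes; the two CopyBuffer
+    // calls below emit the prefix and the data in turn.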
+ bool need_output = false; + if (response != nullptr) { + uint32_t output_count; + RESPOND_AND_SET_NULL_IF_ERROR( + &response, TRITONBACKEND_RequestOutputCount(request, &output_count)); + if (response != nullptr) { + for (uint32_t output_idx = 0; output_idx < output_count; output_idx++) { + const char* req_output_name; + RESPOND_AND_SET_NULL_IF_ERROR( + &response, TRITONBACKEND_RequestOutputName( + request, output_idx, &req_output_name)); + if ((response != nullptr) && (req_output_name == name)) { + need_output = true; + break; + } + } + } + } + + if (need_output) { + TRITONBACKEND_Output* response_output; + TRITONSERVER_Error* err = TRITONBACKEND_ResponseOutput( + response, &response_output, name.c_str(), TRITONSERVER_TYPE_BYTES, + batchn_shape->data(), batchn_shape->size()); + if (err == nullptr) { + // Calculate expected byte size in advance using string offsets + const size_t data_byte_size = + offsets[element_idx + expected_element_cnt] - offsets[element_idx]; + const size_t expected_byte_size = + data_byte_size + sizeof(uint32_t) * expected_element_cnt; + + TRITONSERVER_MemoryType actual_memory_type = + TRITONSERVER_MEMORY_CPU_PINNED; + int64_t actual_memory_type_id = 0; + void* buffer; + err = TRITONBACKEND_OutputBuffer( + response_output, &buffer, expected_byte_size, &actual_memory_type, + &actual_memory_type_id); + if (err == nullptr) { + bool cuda_used = false; + size_t copied_byte_size = 0; + for (size_t e = 0; e < expected_element_cnt; ++e) { + const uint32_t len = + offsets[element_idx + e + 1] - offsets[element_idx + e]; + // Prepend size of the string + err = CopyBuffer( + name, TRITONSERVER_MEMORY_CPU /* src_memory_type */, + 0 /* src_memory_type_id */, actual_memory_type, + actual_memory_type_id, sizeof(uint32_t), + static_cast(&len), + static_cast(buffer) + copied_byte_size, stream_, + &cuda_used); + if (err != nullptr) { + break; + } + + cuda_copy |= cuda_used; + copied_byte_size += sizeof(uint32_t); + + // Copy raw string content + err = CopyBuffer( + name, TRITONSERVER_MEMORY_CPU /* src_memory_type */, + 0 /* src_memory_type_id */, actual_memory_type, + actual_memory_type_id, len, content + offsets[element_idx + e], + static_cast(buffer) + copied_byte_size, stream_, + &cuda_used); + if (err != nullptr) { + break; + } + + cuda_copy |= cuda_used; + copied_byte_size += len; + } + } + } + + RESPOND_AND_SET_NULL_IF_ERROR(&response, err); + } + + element_idx += expected_element_cnt; + } + + return cuda_copy; +} +#endif + +///////////// + +extern "C" { + +TRITONBACKEND_ISPEC TRITONSERVER_Error* +TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) +{ + const char* cname; + RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname)); + std::string name(cname); + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("TRITONBACKEND_Initialize: ") + name).c_str()); + + // Check the backend API version that Triton supports vs. what this + // backend was compiled against. + uint32_t api_version_major, api_version_minor; + RETURN_IF_ERROR( + TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor)); + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Triton TRITONBACKEND API version: ") + + std::to_string(api_version_major) + "." + + std::to_string(api_version_minor)) + .c_str()); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("'") + name + "' TRITONBACKEND API version: " + + std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." 
+ + std::to_string(TRITONBACKEND_API_VERSION_MINOR)) + .c_str()); + + if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) || + (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + (std::string("Triton TRITONBACKEND API version: ") + + std::to_string(api_version_major) + "." + + std::to_string(api_version_minor) + " does not support '" + name + + "' TRITONBACKEND API version: " + + std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." + + std::to_string(TRITONBACKEND_API_VERSION_MINOR)) + .c_str()); + } + + return nullptr; // success +} + +TRITONBACKEND_ISPEC TRITONSERVER_Error* +TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend) +{ + return nullptr; // success +} + +TRITONBACKEND_ISPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) +{ + const char* cname; + RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname)); + std::string name(cname); + + uint64_t version; + RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version)); + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("TRITONBACKEND_ModelInitialize: ") + name + " (version " + + std::to_string(version) + ")") + .c_str()); + + // Create a ModelState object and associate it with the + // TRITONBACKEND_Model. + ModelState* model_state; + RETURN_IF_ERROR(ModelState::Create(model, &model_state)); + RETURN_IF_ERROR( + TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); + + return nullptr; // success +} + +TRITONBACKEND_ISPEC TRITONSERVER_Error* +TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) +{ + void* vstate; + RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); + ModelState* model_state = reinterpret_cast(vstate); + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: delete model state"); + + delete model_state; + + return nullptr; // success +} + +TRITONBACKEND_ISPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) +{ + const char* cname; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname)); + std::string name(cname); + + int32_t device_id; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id)); + TRITONSERVER_InstanceGroupKind kind; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceKind(instance, &kind)); + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (" + + TRITONSERVER_InstanceGroupKindString(kind) + " device " + + std::to_string(device_id) + ")") + .c_str()); + + // Get the model state associated with this instance's model. + TRITONBACKEND_Model* model; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); + + void* vmodelstate; + RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); + ModelState* model_state = reinterpret_cast(vmodelstate); + + // Create a ModelInstanceState object and associate it with the + // TRITONBACKEND_ModelInstance. 
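+  // The state set here is retrieved again in
+  // TRITONBACKEND_ModelInstanceExecute and deleted in
+  // TRITONBACKEND_ModelInstanceFinalize.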
+ ModelInstanceState* instance_state; + RETURN_IF_ERROR( + ModelInstanceState::Create(model_state, instance, &instance_state)); + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState( + instance, reinterpret_cast(instance_state))); + + return nullptr; // success +} + +TRITONBACKEND_ISPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) +{ + void* vstate; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); + ModelInstanceState* instance_state = + reinterpret_cast(vstate); + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); + + delete instance_state; + + return nullptr; // success +} + +TRITONBACKEND_ISPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceExecute( + TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, + const uint32_t request_count) +{ + // Triton will not call this function simultaneously for the same + // 'instance'. But since this backend could be used by multiple + // instances from multiple models the implementation needs to handle + // multiple calls to this function at the same time (with different + // 'instance' objects). Suggested practice for this is to use only + // function-local and model-instance-specific state (obtained from + // 'instance'), which is what we do here. + ModelInstanceState* instance_state; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState( + instance, reinterpret_cast(&instance_state))); + ModelState* model_state = instance_state->StateForModel(); + + // This backend specifies BLOCKING execution policy. That means that + // we should not return from this function until execution is + // complete. Triton will automatically release 'instance' on return + // from this function so that it is again available to be used for + // another call to TRITONBACKEND_ModelInstanceExecute. + + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("model ") + model_state->Name() + ", instance " + + instance_state->Name() + ", executing " + std::to_string(request_count) + + " requests") + .c_str()); + + // At this point we accept ownership of 'requests', which means that + // even if something goes wrong we must still return success from + // this function. If something does go wrong in processing a + // particular request then we send an error response just for the + // specific request. +// instance_state->ProcessRequests(requests, request_count); + + return nullptr; // success +} + +} // extern "C" + +}}} // namespace triton::backend::openvino diff --git a/tools/gen_openvino_dockerfile.py b/tools/gen_openvino_dockerfile.py new file mode 100755 index 0000000..e658df6 --- /dev/null +++ b/tools/gen_openvino_dockerfile.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument('--triton-container', + type=str, + required=True, + help='Triton base container to use for build.') + parser.add_argument('--openvino-version', + type=str, + required=True, + help='OpenVINO version.') + parser.add_argument('--output', + type=str, + required=True, + help='File to write Dockerfile to.') + + FLAGS = parser.parse_args() + + df = ''' +ARG BASE_IMAGE={} +ARG OPENVINO_VERSION={} +'''.format(FLAGS.triton_container, FLAGS.openvino_version) + + df += ''' +FROM ${BASE_IMAGE} + +# Ensure apt-get won't prompt for selecting options +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && apt-get install -y --no-install-recommends \ + patchelf + +WORKDIR /workspace + +ARG OPENVINO_VERSION +ENV INTEL_OPENVINO_DIR /opt/intel/openvino_${OPENVINO_VERSION}.110 +ENV LD_LIBRARY_PATH $INTEL_OPENVINO_DIR/deployment_tools/inference_engine/lib/intel64:$INTEL_OPENVINO_DIR/deployment_tools/ngraph/lib:$INTEL_OPENVINO_DIR/deployment_tools/inference_engine/external/tbb/lib:/usr/local/openblas/lib:$LD_LIBRARY_PATH +ENV PYTHONPATH $INTEL_OPENVINO_DIR/tools:$PYTHONPATH +ENV IE_PLUGINS_PATH $INTEL_OPENVINO_DIR/deployment_tools/inference_engine/lib/intel64 + +RUN wget https://apt.repos.intel.com/openvino/2021/GPG-PUB-KEY-INTEL-OPENVINO-2021 && \ + apt-key add GPG-PUB-KEY-INTEL-OPENVINO-2021 && rm GPG-PUB-KEY-INTEL-OPENVINO-2021 && \ + cd /etc/apt/sources.list.d && \ + echo "deb https://apt.repos.intel.com/openvino/2021 all main">intel-openvino-2021.list && \ + apt update && \ + apt install -y intel-openvino-dev-ubuntu20-${OPENVINO_VERSION}.110 +# && \ +# cd ${INTEL_OPENVINO_DIR}/install_dependencies && ./install_openvino_dependencies.sh + +ARG INTEL_COMPUTE_RUNTIME_URL=https://github.com/intel/compute-runtime/releases/download/19.41.14441 +RUN wget ${INTEL_COMPUTE_RUNTIME_URL}/intel-gmmlib_19.3.2_amd64.deb && \ + wget ${INTEL_COMPUTE_RUNTIME_URL}/intel-igc-core_1.0.2597_amd64.deb && \ + wget ${INTEL_COMPUTE_RUNTIME_URL}/intel-igc-opencl_1.0.2597_amd64.deb && \ + wget ${INTEL_COMPUTE_RUNTIME_URL}/intel-opencl_19.41.14441_amd64.deb && \ + wget ${INTEL_COMPUTE_RUNTIME_URL}/intel-ocloc_19.41.14441_amd64.deb && \ + dpkg -i *.deb && rm -rf *.deb + +# +# Copy all artifacts needed by the backend +# +WORKDIR /opt/openvino + +RUN mkdir -p /opt/openvino/lib && \ + cp -r /opt/intel/openvino_${OPENVINO_VERSION}.110/licensing \ + /opt/openvino/LICENSE.openvino && \ + cp /opt/intel/openvino_${OPENVINO_VERSION}.110/deployment_tools/inference_engine/lib/intel64/libinference_engine.so \ + /opt/openvino/lib && \ + cp /opt/intel/openvino_${OPENVINO_VERSION}.110/deployment_tools/inference_engine/lib/intel64/libinference_engine_transformations.so \ + 
/opt/openvino/lib && \ + cp /opt/intel/openvino_${OPENVINO_VERSION}.110/deployment_tools/ngraph/lib/libngraph.so \ + /opt/openvino/lib && \ + cp /opt/intel/openvino_${OPENVINO_VERSION}.110/deployment_tools/inference_engine/external/tbb/lib/libtbb.so.2 \ + /opt/openvino/lib && \ + cp /opt/intel/openvino_${OPENVINO_VERSION}.110/deployment_tools/inference_engine/external/tbb/lib/libtbbmalloc.so.2 \ + /opt/openvino/lib && \ + cp /opt/intel/openvino_${OPENVINO_VERSION}.110/deployment_tools/inference_engine/lib/intel64/libMKLDNNPlugin.so \ + /opt/openvino/lib && \ + cp /opt/intel/openvino_${OPENVINO_VERSION}.110/deployment_tools/inference_engine/lib/intel64/libinference_engine_lp_transformations.so \ + /opt/openvino/lib && \ + cp /opt/intel/openvino_${OPENVINO_VERSION}.110/deployment_tools/inference_engine/lib/intel64/libinference_engine_ir_reader.so \ + /opt/openvino/lib && \ + cp /opt/intel/openvino_${OPENVINO_VERSION}.110/deployment_tools/inference_engine/lib/intel64/libinference_engine_onnx_reader.so \ + /opt/openvino/lib && \ + (cd /opt/openvino/lib && \ + chmod a-x * && \ + ln -sf libtbb.so.2 libtbb.so && \ + ln -sf libtbbmalloc.so.2 libtbbmalloc.so) +''' + + df += ''' +RUN cd /opt/openvino/lib && \ + for i in `find . -mindepth 1 -maxdepth 1 -type f -name '*\.so*'`; do \ + patchelf --set-rpath '$ORIGIN' $i; \ + done +''' + + with open(FLAGS.output, "w") as dfile: + dfile.write(df)
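
A minimal sketch of how the generator script might be driven once this patch
is applied; the base-image tag, OpenVINO version, and image name below are
placeholder assumptions, not values fixed by this patch:

    #!/usr/bin/env python3
    # Hypothetical driver for tools/gen_openvino_dockerfile.py.
    # Replace the placeholder values before running.
    import subprocess

    triton_container = "nvcr.io/nvidia/tritonserver:<xx.yy>-py3"  # placeholder base image
    openvino_version = "<openvino-version>"                       # placeholder version

    # Emit the Dockerfile that installs OpenVINO and stages its libraries
    # under /opt/openvino.
    subprocess.run(
        ["python3", "tools/gen_openvino_dockerfile.py",
         "--triton-container", triton_container,
         "--openvino-version", openvino_version,
         "--output", "Dockerfile.openvino"],
        check=True,
    )

    # Build the image so the staged /opt/openvino artifacts can later be
    # copied out for the backend build.
    subprocess.run(
        ["docker", "build", "-t", "tritonserver_openvino",
         "-f", "Dockerfile.openvino", "."],
        check=True,
    )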