From aa4a18dda1ad0dfc55412dbe776be99dc0c0c964 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Wed, 9 Jun 2021 09:02:25 +0300 Subject: [PATCH] [IE CLDNN] Updated GPU device config (#6040) --- docs/IE_DG/API_Changes.md | 15 ++- docs/IE_DG/Extensibility_DG/GPU_Kernel.md | 16 --- docs/IE_DG/GPU_Kernels_Tuning.md | 39 ------ docs/IE_DG/Intro_to_Performance.md | 20 +-- .../supported_plugins/{CL_DNN.md => GPU.md} | 45 +++---- .../supported_plugins/Supported_Devices.md | 10 +- docs/doxygen/ie_docs.xml | 3 +- docs/model_server/README.md | 46 +++---- .../dldt_optimization_guide.md | 62 ++++----- docs/snippets/GPU_Kernel.cpp | 5 - docs/snippets/GPU_Kernels_Tuning.cpp | 14 -- docs/snippets/GPU_RemoteBlob_API2.cpp | 4 +- .../include/cldnn/cldnn_config.hpp | 80 +++--------- inference-engine/include/gpu/gpu_config.hpp | 120 ++++++++++++++++++ .../samples/benchmark_app/main.cpp | 4 +- .../samples/hello_query_device/README.md | 6 +- .../src/cldnn_engine/cldnn_config.cpp | 29 +++-- .../src/cldnn_engine/cldnn_engine.cpp | 2 +- .../cldnn_engine/cldnn_executable_network.cpp | 1 - .../src/cldnn_engine/cldnn_graph.cpp | 1 - .../cldnn_remote_blob_tests.cpp | 4 +- .../behavior/config.cpp | 35 ++++- .../behavior/core_integration.cpp | 2 +- .../behavior/infer_request_input.cpp | 2 +- .../behavior/infer_request_output.cpp | 2 +- .../behavior/test_plugin.cpp | 2 +- .../multi/gpu_remote_blob_tests.cpp | 2 +- .../single_layer_tests/tensor_iterator.cpp | 6 +- tools/benchmark/main.py | 2 +- 29 files changed, 301 insertions(+), 278 deletions(-) delete mode 100644 docs/IE_DG/GPU_Kernels_Tuning.md rename docs/IE_DG/supported_plugins/{CL_DNN.md => GPU.md} (62%) delete mode 100644 docs/snippets/GPU_Kernels_Tuning.cpp create mode 100644 inference-engine/include/gpu/gpu_config.hpp diff --git a/docs/IE_DG/API_Changes.md b/docs/IE_DG/API_Changes.md index a234471c13e550..2534a4a6c3856a 100644 --- a/docs/IE_DG/API_Changes.md +++ b/docs/IE_DG/API_Changes.md @@ -14,6 +14,15 @@ The sections below contain detailed list of changes made to the Inference Engine * InferenceEngine::Parameter(std::shared_ptr& var) * std::shared_ptr InferenceEngine::Parameter::asVariant() const * InferenceEngine::Parameter::operator std::shared_ptr() const + * KEY_CLDNN_NV12_TWO_INPUTS GPU plugin option. Use KEY_GPU_NV12_TWO_INPUTS instead + * KEY_CLDNN_PLUGIN_PRIORITY GPU plugin option. Use KEY_GPU_PLUGIN_PRIORITY instead + * KEY_CLDNN_PLUGIN_THROTTLE GPU plugin option. 
Use KEY_GPU_PLUGIN_THROTTLE instead + * KEY_CLDNN_MEM_POOL GPU plugin option + * KEY_CLDNN_GRAPH_DUMPS_DIR GPU plugin option + * KEY_CLDNN_SOURCES_DUMPS_DIR GPU plugin option + * KEY_DUMP_KERNELS GPU plugin option + * KEY_TUNING_MODE GPU plugin option + * KEY_TUNING_FILE GPU plugin option ## 2021.3 @@ -528,7 +537,7 @@ The sections below contain detailed list of changes made to the Inference Engine * DLIA_CONFIG_KEY(ENABLE_STREAMING) config key ### Removed API - + * InferenceEngine::EltwiseLayer::Select from InferenceEngine::EltwiseLayer::eOperation enumeration ## 2019 R2 @@ -577,7 +586,7 @@ The sections below contain detailed list of changes made to the Inference Engine * DLIA_CONFIG_KEY(IO_TRANSFORMATIONS_NATIVE) config key * DLIA_CONFIG_KEY(DUMP_SUPPORTED_LAYERS_INFORMATION) config key * GNA_CONFIG_VALUE(SW_FP32) config value for GNA_CONFIG_KEY(DEVICE_MODE) key - * MULTI_CONFIG_KEY(DEVICE_PRIORITIES) config key for `MULTI` device + * MULTI_CONFIG_KEY(DEVICE_PRIORITIES) config key for `MULTI` device * InferenceEngine::CNNNetReader::ReadNetwork(const std::wstring &filepath) new method * InferenceEngine::CNNNetReader::ReadWeights(const std::wstring &filepath) new method * InferenceEngine::ExecutableNetwork::ExecutableNetwork(IExecutableNetwork::Ptr actual, InferenceEnginePluginPtr plg) constructor with additional `plg` parameter @@ -593,7 +602,7 @@ The sections below contain detailed list of changes made to the Inference Engine * InferenceEngine::EltwiseLayer::Logical_NOT, InferenceEngine::EltwiseLayer::Mean, InferenceEngine::EltwiseLayer::Select extensions to InferenceEngine::EltwiseLayer::eOperation enumeration * InferenceEngine::OneHotLayer new class * InferenceEngine::SelectLayer new class - * InferenceEngine::BroadcastLayer new class + * InferenceEngine::BroadcastLayer new class * InferenceEngine::MathLayer new class * InferenceEngine::ReduceLayer new class * InferenceEngine::TopKLayer new class diff --git a/docs/IE_DG/Extensibility_DG/GPU_Kernel.md b/docs/IE_DG/Extensibility_DG/GPU_Kernel.md index 09ace6f0a2942f..d9fd809f8e4227 100644 --- a/docs/IE_DG/Extensibility_DG/GPU_Kernel.md +++ b/docs/IE_DG/Extensibility_DG/GPU_Kernel.md @@ -219,22 +219,6 @@ __kernel void example_relu_kernel( ## Debugging Tips -* **Dumping the Resulting Kernels**. -It is recommended to get a dump of the kernel with all of -the values set by the Inference Engine, such as tensor sizes, -floating-point, and integer kernel parameters. To get the dump, add the -following line to your code that configures the GPU plugin to output the -custom kernels: - -@snippet snippets/GPU_Kernel.cpp part1 - -When the Inference Engine compiles the kernels for the specific network, -it also outputs the resulting code for the custom kernels. In the -directory of your executable, find files like -`clDNN_program0.cl`, `clDNN_program1.cl`. There are as many files as -distinct sets of parameters for your custom kernel: different input -tensor sizes and kernel parameters. - * **Using `printf` in the OpenCL™ Kernels**. To debug the specific values, you can use `printf` in your kernels. 
However, be careful: for instance, do not output excessively diff --git a/docs/IE_DG/GPU_Kernels_Tuning.md b/docs/IE_DG/GPU_Kernels_Tuning.md deleted file mode 100644 index 5bb6a8334b2372..00000000000000 --- a/docs/IE_DG/GPU_Kernels_Tuning.md +++ /dev/null @@ -1,39 +0,0 @@ -Using GPU Kernels Tuning {#openvino_docs_IE_DG_GPU_Kernels_Tuning} -====================== - -GPU Kernels Tuning allows you to tune models, so the heavy computational layers are configured to fit better into -hardware, which the tuning was done on. It is required to achieve best performance on GPU. -> **NOTE** Currently only convolution and fully connected layers undergo tuning process. It means that the performance boost depends on the amount of that layers in the model. - -OpenVINO™ releases include the `/inference_engine/bin/intel64/Release/cache.json` file with pretuned data for current state of the art models. It is highly recommended to do the -tuning for new kind of models, hardwares or drivers. - -## Tuned data - -GPU tuning data is saved in JSON format. The file is composed of 2 types of attributes and 1 type of value: -* Execution units number (attribute): splits the content into different EU sections -* Hash (attribute): hashed tuned kernel data -* Key (value): Array with kernel name and kernel's mode index - -## Usage - ---- - -You can activate Kernels Tuning process by setting `KEY_TUNING_MODE` flag to `TUNING_CREATE` and `KEY_TUNING_FILE` to `<"filename">` in a configuration map that is -passed to the plugin while loading a network. -This configuration modifies the behavior of the `ExecutableNetwork` object. Instead of standard network compilation, it will run the tuning process. -Please keep in mind that the tuning can be very time consuming. The bigger the network, the longer it will take. -File with tuned data is the result of this step. - -> **NOTE** If a filename passed to `KEY_TUNING_FILE` points to existing tuned data and you are tuning a new model, then this file will be extended by new data. This allows you to extend existing `cache.json` provided in the OpenVINO™ release package. - -The example below shows how to set and use the key files: - -@snippet snippets/GPU_Kernels_Tuning.cpp part0 - ---- - -You can activate the inference with tuned data by setting `KEY_TUNING_MODE` flag to `TUNING_USE_EXISTING` and -`KEY_TUNING_FILE` flag to `<"filename">`. - -GPU backend will process the content of the file during network compilation to configure the OpenCL kernels for the best performance. diff --git a/docs/IE_DG/Intro_to_Performance.md b/docs/IE_DG/Intro_to_Performance.md index 78d5c59c417d0f..94d0173dbbe2b4 100644 --- a/docs/IE_DG/Intro_to_Performance.md +++ b/docs/IE_DG/Intro_to_Performance.md @@ -22,7 +22,7 @@ $ benchmark_app -m -enforcebf16=false Notice that for quantized (e.g. INT8) models the bfloat16 calculations (of the layers that remain in FP32) is disabled by default. Refer to the [CPU Plugin documentation](supported_plugins/CPU.md) for more details. 
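+For illustration only, a minimal sketch of what this toggle looks like in application code — assuming the `KEY_ENFORCE_BF16` CPU configuration key (the device name and value here are illustrative, not a prescribed setup):
+```cpp
+#include <ie_core.hpp>
+
+int main() {
+    InferenceEngine::Core core;
+    // Mirrors benchmark_app's -enforcebf16=false: keep the layers that
+    // remain in FP32 executing in FP32 rather than bfloat16.
+    core.SetConfig({ { InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16,
+                       InferenceEngine::PluginConfigParams::NO } }, "CPU");
+    return 0;
+}
+```
+Passing `YES` instead requests bfloat16 execution wherever the platform supports it.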
-Similarly, the GPU device has a dedicated config key to enable FP16 execution of the layers that remain in FP32 in the quantized models (as the quantization is typically performed on the FP32 models), refer to the ENABLE_FP16_FOR_QUANTIZED_MODELS key in the [GPU Plugin documentation](supported_plugins/CL_DNN.md)
+Similarly, the GPU device has a dedicated config key to enable FP16 execution of the layers that remain in FP32 in the quantized models (as the quantization is typically performed on the FP32 models); refer to the ENABLE_FP16_FOR_QUANTIZED_MODELS key in the [GPU Plugin documentation](supported_plugins/GPU.md)
 
 ## Latency vs. Throughput
 One way to increase computational efficiency is batching, which combines many (potentially tens) of
@@ -72,30 +72,20 @@ Instead, it is possible to keep a separate infer request per camera or another s
 
 ## Benchmark App
 [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample is the best performance reference.
-It has a lot of device-specific knobs, but the primary usage is as simple as: 
+It has a lot of device-specific knobs, but the primary usage is as simple as:
```bash
$ ./benchmark_app -d GPU -m <model> -i <input>
```
-to measure the performance of the model on the GPU. 
+to measure the performance of the model on the GPU.
Or
```bash
$ ./benchmark_app -d CPU -m <model> -i <input>
```
to execute on the CPU instead.
 
-For example, for the CPU throughput mode from the previous section, you can play with number of streams (`-nstreams` command-line param). 
-Try different values of the `-nstreams` argument from `1` to a number of CPU cores and find one that provides the best performance. For example, on a 8-core CPU, compare the `-nstreams 1` (which is a latency-oriented scenario) to the `2`, `4` and `8` streams. Notice that `benchmark_app` automatically queries/creates/runs number of requests required to saturate the given number of streams. 
+For example, for the CPU throughput mode from the previous section, you can play with the number of streams (`-nstreams` command-line param).
+Try different values of the `-nstreams` argument from `1` to the number of CPU cores and find one that provides the best performance. For example, on an 8-core CPU, compare `-nstreams 1` (which is a latency-oriented scenario) to the `2`, `4` and `8` streams. Notice that `benchmark_app` automatically queries/creates/runs the number of requests required to saturate the given number of streams.
 
Finally, notice that when you don't specify the number of streams with `-nstreams`, the "AUTO" value for the streams is used, e.g. for the CPU this is [CPU_THROUGHPUT_AUTO](supported_plugins/CPU.md). You can spot the actual value behind "AUTO" for your machine in the application output.
Notice that the "AUTO" number is not necessarily most optimal, so it is generally recommended to play either with the benchmark_app's "-nstreams" as described above, or via the [new Workbench tool](@ref workbench_docs_Workbench_DG_Introduction). This allows you to simplify the app-logic, as you don't need to combine multiple inputs into a batch to achieve good CPU performance.
Instead, it is possible to keep a separate infer request per camera or another source of input and process the requests in parallel using Async API.
-
-## Kernels Tuning for GPU
-
-GPU backend comes with a feature, that allows models tuning, so the workload is configured to fit better into hardware.
-
-Tuning is time consuming process, which internally execute every layer several (or even hundreds) times to find most performant configuration. 
- -This configuration is saved into json-formatted file, whose name can be passed as plugin param to network. GPU backend will process this data to configure kernels for the best performance. - -For more details about Kernels Tuning and How-To please refer to [GPU Kernels Tuning](GPU_Kernels_Tuning.md). diff --git a/docs/IE_DG/supported_plugins/CL_DNN.md b/docs/IE_DG/supported_plugins/GPU.md similarity index 62% rename from docs/IE_DG/supported_plugins/CL_DNN.md rename to docs/IE_DG/supported_plugins/GPU.md index 0216ae71d0dd36..cc12be98a121e1 100644 --- a/docs/IE_DG/supported_plugins/CL_DNN.md +++ b/docs/IE_DG/supported_plugins/GPU.md @@ -1,4 +1,4 @@ -GPU Plugin {#openvino_docs_IE_DG_supported_plugins_CL_DNN} +GPU Plugin {#openvino_docs_IE_DG_supported_plugins_GPU} ======= The GPU plugin uses the Intel® Compute Library for Deep Neural Networks (clDNN) to infer deep neural networks. @@ -89,13 +89,10 @@ Some layers are executed during the load time, not during the inference. One of The following layers are not accelerated on the GPU and executed on the host CPU instead: * Proposal -* SimplerNMS +* NonMaxSuppression * PriorBox * DetectionOutput -## Known Layers Limitations -* ROIPooling is supported for 'max' value of 'method' attribute. - ## Supported Configuration Parameters The plugin supports the configuration parameters listed below. @@ -107,31 +104,21 @@ When specifying key values as raw strings (that is, when using Python API), omit | `KEY_CACHE_DIR` | `""` | `""` | Specifies a directory where compiled OCL binaries can be cached. First model loading generates the cache, and all subsequent LoadNetwork calls use precompiled kernels which significantly improves load time. If empty - caching is disabled | | `KEY_PERF_COUNT` | `YES` / `NO` | `NO` | Collect performance counters during inference | | `KEY_CONFIG_FILE` | `" [ ...]"` | `""` | Load custom layer configuration files | -| `KEY_DUMP_KERNELS` | `YES` / `NO` | `NO` | Dump the final kernels used for custom layers | -| `KEY_TUNING_MODE` | `TUNING_DISABLED`
`TUNING_CREATE`
`TUNING_USE_EXISTING` | `TUNING_DISABLED` | Disable inference kernel tuning
Create tuning file (expect much longer runtime)
Use an existing tuning file | -| `KEY_TUNING_FILE` | `""` | `""` | Tuning file to create / use | -| `KEY_CLDNN_PLUGIN_PRIORITY` | `<0-3>` | `0` | OpenCL queue priority (before usage, make sure your OpenCL driver supports appropriate extension)
Higher value means higher priority for clDNN OpenCL queue. 0 disables the setting. | -| `KEY_CLDNN_PLUGIN_THROTTLE` | `<0-3>` | `0` | OpenCL queue throttling (before usage, make sure your OpenCL driver supports appropriate extension)
Lower value means lower driver thread priority and longer sleep time for it. 0 disables the setting. | -| `KEY_CLDNN_GRAPH_DUMPS_DIR` | `""` | `""` | clDNN graph optimizer stages dump output directory (in GraphViz format) | -| `KEY_CLDNN_SOURCES_DUMPS_DIR` | `""` | `""` | Final optimized clDNN OpenCL sources dump output directory | -| `KEY_GPU_THROUGHPUT_STREAMS` | `KEY_GPU_THROUGHPUT_AUTO`, or positive integer| 1 | Specifies a number of GPU "execution" streams for the throughput mode (upper bound for a number of inference requests that can be executed simultaneously).
This option is can be used to decrease GPU stall time by providing more effective load from several streams. Increasing the number of streams usually is more effective for smaller topologies or smaller input sizes. Note that your application should provide enough parallel slack (e.g. running many inference requests) to leverage full GPU bandwidth. Additional streams consume several times more GPU memory, so make sure the system has enough memory available to suit parallel stream execution. Multiple streams might also put additional load on CPU. If CPU load increases, it can be regulated by setting an appropriate `KEY_CLDNN_PLUGIN_THROTTLE` option value (see above). If your target system has relatively weak CPU, keep throttling low.
The default value is 1, which implies latency-oriented behavior.
`KEY_GPU_THROUGHPUT_AUTO` creates bare minimum of streams to improve the performance; this is the most portable option if you are not sure how many resources your target machine has (and what would be the optimal number of streams).
A positive integer value creates the requested number of streams. |
+| `KEY_GPU_PLUGIN_PRIORITY` | `<0-3>` | `0` | OpenCL queue priority (before usage, make sure your OpenCL driver supports the appropriate extension)<br>Higher value means higher priority for the OpenCL queue. 0 disables the setting. |
+| `KEY_GPU_PLUGIN_THROTTLE` | `<0-3>` | `0` | OpenCL queue throttling (before usage, make sure your OpenCL driver supports the appropriate extension)<br>Lower value means lower driver thread priority and longer sleep time for it. 0 disables the setting. |
+| `KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS` | `YES` / `NO` | `YES` | Allows using FP16+INT8 mixed precision mode, so non-quantized parts of a model will be executed in FP16 precision for FP16 IR. Does not affect quantized FP32 IRs |
+| `KEY_GPU_NV12_TWO_INPUTS` | `YES` / `NO` | `NO` | Controls preprocessing logic for NV12 input. If set to `YES`, the device graph expects a biplanar NV12 blob as input, which is passed directly to the device execution graph. Otherwise, preprocessing via G-API is used to convert NV12 to BGR, so the GPU graph expects a single input |
+| `KEY_GPU_THROUGHPUT_STREAMS` | `KEY_GPU_THROUGHPUT_AUTO`, or positive integer | 1 | Specifies a number of GPU "execution" streams for the throughput mode (upper bound for a number of inference requests that can be executed simultaneously).<br>This option can be used to decrease GPU stall time by providing a more effective load from several streams. Increasing the number of streams is usually more effective for smaller topologies or smaller input sizes. Note that your application should provide enough parallel slack (e.g. running many inference requests) to leverage full GPU bandwidth. Additional streams consume several times more GPU memory, so make sure the system has enough memory available to suit parallel stream execution. Multiple streams might also put additional load on the CPU. If CPU load increases, it can be regulated by setting an appropriate `KEY_GPU_PLUGIN_THROTTLE` option value (see above). If your target system has a relatively weak CPU, keep throttling low.<br>The default value is 1, which implies latency-oriented behavior.<br>`KEY_GPU_THROUGHPUT_AUTO` creates the bare minimum of streams to improve the performance; this is the most portable option if you are not sure how many resources your target machine has (and what would be the optimal number of streams).<br>A positive integer value creates the requested number of streams. |
| `KEY_EXCLUSIVE_ASYNC_REQUESTS` | `YES` / `NO` | `NO` | Forces async requests (also from different executable networks) to execute serially.|
-| `KEY_CLDNN_MAX_NUM_THREADS` | `integer value` | `maximum # of HW threads available in host environment` | Specifies the number of CPU threads that can be used for clDNN engine, e.g, JIT compilation of clDNN kernels or clDNN cpu kernel processing. The default value is set as the number of maximum available threads in host environment to minimize the time for LoadNetwork, where the clDNN kernel build time occupies a large portion. Note that if the specified value is larger than the maximum available # of threads or less than zero, it is set as maximum available # of threads. It can be specified with a smaller number than the available HW threads according to the usage scenario, e.g., when the user wants to assign more CPU threads while clDNN plugin is running. Note that setting this value with lower number will affect not only the network loading time but also the cpu layers of clDNN networks that are optimized with multi-threading. |
-| `KEY_CLDNN_ENABLE_LOOP_UNROLLING` | `YES` / `NO` | `YES` | Enables recurrent layers such as TensorIterator or Loop with fixed iteration count to be unrolled. It is turned on by default. Turning this key on will achieve better inference performance for loops with not too many iteration counts (less than 16, as a rule of thumb). Turning this key off will achieve better performance for both graph loading time and inference time with many iteration counts (greater than 16). Note that turning this key on will increase the graph loading time in proportion to the iteration counts. Thus, this key should be turned off if graph loading time is considered to be most important target to optimize. |
-
-## Note on Debug Capabilities of the GPU Plugin
-
-Inference Engine GPU plugin provides possibility to dump the user custom OpenCL™ kernels to a file to allow you to properly debug compilation issues in your custom kernels.
-
-The application can use the SetConfig() function with the key PluginConfigParams::KEY_DUMP_KERNELS and value: PluginConfigParams::YES. Then during network loading, all custom layers will print their OpenCL kernels with the JIT instrumentation added by the plugin.
-The kernels will be stored in the working directory under files named the following way: clDNN_program0.cl, clDNN_program1.cl.
-
-This option is disabled by default. Additionally, the application can call the SetConfig() function with the key PluginConfigParams::KEY_DUMP_KERNELS and value: PluginConfigParams::NO before network loading.
-
-How to verify that this option is disabled:
-1. Delete all clDNN_program*.cl files from the current directory
-2. Run your application to load a network
-3. Examine the working directory for the presence of any kernel file (for example, clDNN_program0.cl)
+| `KEY_GPU_MAX_NUM_THREADS` | `integer value` | `maximum # of HW threads available in host environment` | Specifies the number of CPU threads that can be used for the GPU engine, e.g., JIT compilation of GPU kernels or CPU kernel processing within the GPU plugin. The default value is the maximum number of threads available in the host environment, to minimize the time for LoadNetwork, where the GPU kernel build time occupies a large portion. Note that if the specified value is larger than the maximum available number of threads or less than zero, it is set to the maximum available number of threads. It can be set to a smaller number than the available HW threads according to the usage scenario, e.g., when the user wants to assign more CPU threads while the GPU plugin is running. Note that setting this value to a lower number will affect not only the network loading time but also the CPU layers of GPU networks that are optimized with multi-threading. |
+| `KEY_GPU_ENABLE_LOOP_UNROLLING` | `YES` / `NO` | `YES` | Enables recurrent layers such as TensorIterator or Loop with fixed iteration count to be unrolled. It is turned on by default. Turning this key on will achieve better inference performance for loops with not too many iteration counts (less than 16, as a rule of thumb). Turning this key off will achieve better performance for both graph loading time and inference time with many iteration counts (greater than 16). Note that turning this key on will increase the graph loading time in proportion to the iteration counts. Thus, this key should be turned off if graph loading time is considered to be the most important target to optimize. |
+| `KEY_CLDNN_PLUGIN_PRIORITY` | `<0-3>` | `0` | OpenCL queue priority (before usage, make sure your OpenCL driver supports the appropriate extension)<br>Higher value means higher priority for the OpenCL queue. 0 disables the setting. **Deprecated**. Please use KEY_GPU_PLUGIN_PRIORITY |
+| `KEY_CLDNN_PLUGIN_THROTTLE` | `<0-3>` | `0` | OpenCL queue throttling (before usage, make sure your OpenCL driver supports the appropriate extension)<br>Lower value means lower driver thread priority and longer sleep time for it. 0 disables the setting. **Deprecated**. Please use KEY_GPU_PLUGIN_THROTTLE |
+| `KEY_CLDNN_GRAPH_DUMPS_DIR` | `""` | `""` | clDNN graph optimizer stages dump output directory (in GraphViz format). **Deprecated**. Will be removed in the next release |
+| `KEY_CLDNN_SOURCES_DUMPS_DIR` | `""` | `""` | Final optimized clDNN OpenCL sources dump output directory. **Deprecated**. Will be removed in the next release |
+| `KEY_DUMP_KERNELS` | `YES` / `NO` | `NO` | Dump the final kernels used for custom layers. **Deprecated**. Will be removed in the next release |
+| `KEY_TUNING_MODE` | `TUNING_DISABLED`<br>`TUNING_CREATE`<br>`TUNING_USE_EXISTING` | `TUNING_DISABLED` | Disable inference kernel tuning<br>Create tuning file (expect much longer runtime)<br>Use an existing tuning file. **Deprecated**. Will be removed in the next release |
+| `KEY_TUNING_FILE` | `"<filename>"` | `""` | Tuning file to create / use. **Deprecated**. Will be removed in the next release |
 
## GPU Context and Video Memory Sharing RemoteBlob API
 
diff --git a/docs/IE_DG/supported_plugins/Supported_Devices.md b/docs/IE_DG/supported_plugins/Supported_Devices.md
index ed8cabec076f03..e1140ae4b74cae 100644
--- a/docs/IE_DG/supported_plugins/Supported_Devices.md
+++ b/docs/IE_DG/supported_plugins/Supported_Devices.md
@@ -9,11 +9,11 @@ The Inference Engine provides unique capabilities to infer deep learning models
 
| Plugin | Device types |
|------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|
-|[GPU plugin](CL_DNN.md) |Intel® Processor Graphics, including Intel® HD Graphics and Intel® Iris® Graphics |
+|[GPU plugin](GPU.md) |Intel® Processor Graphics, including Intel® HD Graphics and Intel® Iris® Graphics |
|[CPU plugin](CPU.md) |Intel® Xeon® with Intel® Advanced Vector Extensions 2 (Intel® AVX2), Intel® Advanced Vector Extensions 512 (Intel® AVX-512), and AVX512_BF16, Intel® Core™ Processors with Intel® AVX2, Intel® Atom® Processors with Intel® Streaming SIMD Extensions (Intel® SSE) |
|[VPU plugins](VPU.md) (available in the Intel® Distribution of OpenVINO™ toolkit) |Intel® Neural Compute Stick 2 powered by the Intel® Movidius™ Myriad™ X, Intel® Vision Accelerator Design with Intel® Movidius™ VPUs |
|[GNA plugin](GNA.md) (available in the Intel® Distribution of OpenVINO™ toolkit) |Intel® Speech Enabling Developer Kit, Amazon Alexa* Premium Far-Field Developer Kit, Intel® Pentium® Silver J5005 Processor, Intel® Pentium® Silver N5000 Processor, Intel® Celeron® J4005 Processor, Intel® Celeron® J4105 Processor, Intel® Celeron® Processor N4100, Intel® Celeron® Processor N4000, Intel® Core™ i3-8121U Processor, Intel® Core™ i7-1065G7 Processor, Intel® Core™ i7-1060G7 Processor, Intel® Core™ i5-1035G4 Processor, Intel® Core™ i5-1035G7 Processor, Intel® Core™ i5-1035G1 Processor, Intel® Core™ i5-1030G7 Processor, Intel® Core™ i5-1030G4 Processor, Intel® Core™ i3-1005G1 Processor, Intel® Core™ i3-1000G1 Processor, Intel® Core™ i3-1000G4 Processor|
-|[Multi-Device plugin](MULTI.md) |Multi-Device plugin enables simultaneous inference of the same network on several Intel® devices in parallel |
+|[Multi-Device plugin](MULTI.md) |Multi-Device plugin enables simultaneous inference of the same network on several Intel® devices in parallel |
|[Heterogeneous plugin](HETERO.md) |Heterogeneous plugin enables automatic inference splitting between several Intel® devices (for example if a device doesn't [support certain layers](#supported-layers)). |
 
Devices similar to the ones we have used for benchmarking can be accessed using [Intel® DevCloud for the Edge](https://devcloud.intel.com/edge/), a remote development environment with access to Intel® hardware and the latest versions of the Intel® Distribution of the OpenVINO™ Toolkit. [Learn more](https://devcloud.intel.com/edge/get_started/devcloud/) or [Register here](https://inteliot.force.com/DevcloudForEdge/s/).
 
@@ -60,7 +60,7 @@ For example, the CHW value at index (c,h,w) is physically located at index (c\*H
|GNA plugin |Supported |Supported |Not supported |
\* - currently, only a limited set of topologies might benefit from enabling an I8 model on GPU
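+As an illustrative sketch only (the device name is an example, and the exact capability strings are device- and version-dependent), an application can query what a device reports as optimized before settling on a model precision:
+```cpp
+#include <ie_core.hpp>
+#include <ie_plugin_config.hpp>
+#include <iostream>
+#include <string>
+#include <vector>
+
+int main() {
+    InferenceEngine::Core core;
+    // OPTIMIZATION_CAPABILITIES lists entries such as "FP32", "FP16" or "INT8".
+    auto capabilities = core.GetMetric("GPU", METRIC_KEY(OPTIMIZATION_CAPABILITIES))
+                            .as<std::vector<std::string>>();
+    for (const auto& capability : capabilities)
+        std::cout << capability << std::endl;
+    return 0;
+}
+```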
For [Multi-Device](MULTI.md) and [Heterogeneous](HETERO.md) execution
-the supported models formats depends on the actual underlying devices. _Generally, FP16 is preferable as it is most ubiquitous and performant_. 
+the supported model formats depend on the actual underlying devices. _Generally, FP16 is preferable as it is most ubiquitous and performant_.
 
### Supported Input Precision
 
@@ -73,7 +73,7 @@ the supported models formats depends on the actual underlying devices. _Generall
\* - Supported via `SetBlob` only, `GetBlob` returns FP32
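+For illustration, a minimal sketch of requesting `U8` input precision before loading a network (the model path and device name are placeholders):
+```cpp
+#include <ie_core.hpp>
+
+int main() {
+    InferenceEngine::Core core;
+    auto network = core.ReadNetwork("model.xml");  // placeholder path
+    // Request U8 inputs; the plugin then converts to its internal precision.
+    auto inputs_info = network.getInputsInfo();
+    for (auto& input : inputs_info)
+        input.second->setPrecision(InferenceEngine::Precision::U8);
+    auto executable_network = core.LoadNetwork(network, "GPU");
+    return 0;
+}
+```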
For [Multi-Device](MULTI.md) and [Heterogeneous](HETERO.md) execution -the supported input precision depends on the actual underlying devices. _Generally, U8 is preferable as it is most ubiquitous_. +the supported input precision depends on the actual underlying devices. _Generally, U8 is preferable as it is most ubiquitous_. ### Supported Output Precision @@ -84,7 +84,7 @@ the supported input precision depends on the actual underlying devices. _Genera |VPU plugins |Supported |Supported | |GNA plugin |Supported |Not supported | For [Multi-Device](MULTI.md) and [Heterogeneous](HETERO.md) execution -the supported output precision depends on the actual underlying devices. _Generally, FP32 is preferable as it is most ubiquitous_. +the supported output precision depends on the actual underlying devices. _Generally, FP32 is preferable as it is most ubiquitous_. ### Supported Input Layout diff --git a/docs/doxygen/ie_docs.xml b/docs/doxygen/ie_docs.xml index f287487913d56a..bb006c9f01c630 100644 --- a/docs/doxygen/ie_docs.xml +++ b/docs/doxygen/ie_docs.xml @@ -293,7 +293,6 @@ limitations under the License. - @@ -303,7 +302,7 @@ limitations under the License. - + diff --git a/docs/model_server/README.md b/docs/model_server/README.md index ae5d03914ab347..e6c7144f3cb6c9 100644 --- a/docs/model_server/README.md +++ b/docs/model_server/README.md @@ -1,29 +1,29 @@ # OpenVINO™ Model Server {#openvino_docs_ovms} -OpenVINO™ Model Server (OVMS) is a scalable, high-performance solution for serving machine learning models optimized for Intel® architectures. -The server provides an inference service via gRPC or REST API - making it easy to deploy new algorithms and AI experiments using the same -architecture as [TensorFlow* Serving](https://github.com/tensorflow/serving) for any models trained in a framework that is supported -by [OpenVINO](https://software.intel.com/en-us/openvino-toolkit). +OpenVINO™ Model Server (OVMS) is a scalable, high-performance solution for serving machine learning models optimized for Intel® architectures. +The server provides an inference service via gRPC or REST API - making it easy to deploy new algorithms and AI experiments using the same +architecture as [TensorFlow* Serving](https://github.com/tensorflow/serving) for any models trained in a framework that is supported +by [OpenVINO](https://software.intel.com/en-us/openvino-toolkit). The server implements gRPC and REST API framework with data serialization and deserialization using TensorFlow Serving API, and OpenVINO™ as the inference execution provider. Model repositories may reside on a locally accessible file system (for example, NFS), Google Cloud Storage\* (GCS), Amazon S3\*, MinIO\*, or Azure Blob Storage\*. - + OVMS is now implemented in C++ and provides much higher scalability compared to its predecessor in the Python version. You can take advantage of all the power of Xeon® CPU capabilities or AI accelerators and expose it over the network interface. Read the [release notes](https://github.com/openvinotoolkit/model_server/releases) to find out what's new in the C++ version. Review the [Architecture Concept](https://github.com/openvinotoolkit/model_server/blob/main/docs/architecture.md) document for more details. -A few key features: +A few key features: - Support for multiple frameworks. Serve models trained in popular formats such as Caffe\*, TensorFlow\*, MXNet\*, and ONNX*. 
- Deploy new [model versions](https://github.com/openvinotoolkit/model_server/blob/main/docs/docker_container.md#model-version-policy) without changing client code. -- Support for AI accelerators including [Intel Movidius Myriad VPUs](../IE_DG/supported_plugins/VPU), -[GPU](../IE_DG/supported_plugins/CL_DNN), and [HDDL](../IE_DG/supported_plugins/HDDL). +- Support for AI accelerators including [Intel Movidius Myriad VPUs](../IE_DG/supported_plugins/VPU.md), +[GPU](../IE_DG/supported_plugins/GPU.md), and [HDDL](../IE_DG/supported_plugins/HDDL.md). - The server can be enabled both on [Bare Metal Hosts](https://github.com/openvinotoolkit/model_server/blob/main/docs/host.md) or in [Docker* containers](https://github.com/openvinotoolkit/model_server/blob/main/docs/docker_container.md). -- [Kubernetes deployments](https://github.com/openvinotoolkit/model_server/blob/main/deploy). The server can be deployed in a Kubernetes cluster allowing the inference service to scale horizontally and ensure high availability. -- [Model reshaping](https://github.com/openvinotoolkit/model_server/blob/main/docs/docker_container.md#model-reshaping). The server supports reshaping models in runtime. +- [Kubernetes deployments](https://github.com/openvinotoolkit/model_server/blob/main/deploy). The server can be deployed in a Kubernetes cluster allowing the inference service to scale horizontally and ensure high availability. +- [Model reshaping](https://github.com/openvinotoolkit/model_server/blob/main/docs/docker_container.md#model-reshaping). The server supports reshaping models in runtime. - [Model ensemble](https://github.com/openvinotoolkit/model_server/blob/main/docs/ensemble_scheduler.md) (preview). Connect multiple models to deploy complex processing solutions and reduce overhead of sending data back and forth. > **NOTE**: OVMS has been tested on CentOS\* and Ubuntu\*. Publicly released [Docker images](https://hub.docker.com/r/openvino/model_server) are based on CentOS. @@ -68,30 +68,30 @@ For more detailed guides on using the Model Server in various scenarios, visit t ## API Documentation -### GRPC +### GRPC -OpenVINO™ Model Server gRPC API is documented in the proto buffer files in [tensorflow_serving_api](https://github.com/tensorflow/serving/tree/r2.2/tensorflow_serving/apis). +OpenVINO™ Model Server gRPC API is documented in the proto buffer files in [tensorflow_serving_api](https://github.com/tensorflow/serving/tree/r2.2/tensorflow_serving/apis). -> **NOTE:** The implementations for `Predict`, `GetModelMetadata`, and `GetModelStatus` function calls are currently available. +> **NOTE:** The implementations for `Predict`, `GetModelMetadata`, and `GetModelStatus` function calls are currently available. > These are the most generic function calls and should address most of the usage scenarios. -[Predict proto](https://github.com/tensorflow/serving/blob/r2.2/tensorflow_serving/apis/predict.proto) defines two message specifications: `PredictRequest` and `PredictResponse` used while calling Prediction endpoint. -* `PredictRequest` specifies information about the model spec, that is name and version, and a map of input data serialized via +[Predict proto](https://github.com/tensorflow/serving/blob/r2.2/tensorflow_serving/apis/predict.proto) defines two message specifications: `PredictRequest` and `PredictResponse` used while calling Prediction endpoint. 
+* `PredictRequest` specifies information about the model spec, that is name and version, and a map of input data serialized via [TensorProto](https://github.com/tensorflow/tensorflow/blob/r2.2/tensorflow/core/framework/tensor.proto) to a string format. -* `PredictResponse` includes a map of outputs serialized by +* `PredictResponse` includes a map of outputs serialized by [TensorProto](https://github.com/tensorflow/tensorflow/blob/r2.2/tensorflow/core/framework/tensor.proto) and information about the used model spec. - + [Get Model Metadata proto](https://github.com/tensorflow/serving/blob/r2.2/tensorflow_serving/apis/get_model_metadata.proto) defines three message definitions used while calling Metadata endpoint: `SignatureDefMap`, `GetModelMetadataRequest`, `GetModelMetadataResponse`. A function call `GetModelMetadata` accepts model spec information as input and returns Signature Definition content in the format similar to TensorFlow Serving. [Get Model Status proto](https://github.com/tensorflow/serving/blob/r2.2/tensorflow_serving/apis/get_model_status.proto) defines three message definitions used while calling Status endpoint: - `GetModelStatusRequest`, `ModelVersionStatus`, `GetModelStatusResponse` that report all exposed versions including their state in their lifecycle. + `GetModelStatusRequest`, `ModelVersionStatus`, `GetModelStatusResponse` that report all exposed versions including their state in their lifecycle. Refer to the [example client code](https://github.com/openvinotoolkit/model_server/blob/main/example_client) to learn how to use this API and submit the requests using the gRPC interface. -Using the gRPC interface is recommended for optimal performance due to its faster implementation of input data deserialization. It enables you to achieve lower latency, especially with larger input messages like images. +Using the gRPC interface is recommended for optimal performance due to its faster implementation of input data deserialization. It enables you to achieve lower latency, especially with larger input messages like images. ### REST @@ -99,9 +99,9 @@ OpenVINO™ Model Server RESTful API follows the documentation from the [Ten Both row and column format of the requests are implemented. -> **NOTE**: Just like with gRPC, only the implementations for `Predict`, `GetModelMetadata`, and `GetModelStatus` function calls are currently available. +> **NOTE**: Just like with gRPC, only the implementations for `Predict`, `GetModelMetadata`, and `GetModelStatus` function calls are currently available. -Only the numerical data types are supported. +Only the numerical data types are supported. Review the exemplary clients below to find out more how to connect and run inference requests. @@ -110,9 +110,9 @@ REST API is recommended when the primary goal is in reducing the number of clien ## Known Limitations -* Currently, `Predict`, `GetModelMetadata`, and `GetModelStatus` calls are implemented using the TensorFlow Serving API. +* Currently, `Predict`, `GetModelMetadata`, and `GetModelStatus` calls are implemented using the TensorFlow Serving API. * `Classify`, `Regress`, and `MultiInference` are not included. -* `Output_filter` is not effective in the `Predict` call. All outputs defined in the model are returned to the clients. +* `Output_filter` is not effective in the `Predict` call. All outputs defined in the model are returned to the clients. 
## OpenVINO Model Server Contribution Policy diff --git a/docs/optimization_guide/dldt_optimization_guide.md b/docs/optimization_guide/dldt_optimization_guide.md index e70c0365a4165c..9ece7fec93a628 100644 --- a/docs/optimization_guide/dldt_optimization_guide.md +++ b/docs/optimization_guide/dldt_optimization_guide.md @@ -2,13 +2,13 @@ ## Introduction -The purpose of this document is to give you performance-related insights to every step of the network deployment process. +The purpose of this document is to give you performance-related insights to every step of the network deployment process. For information on the general workflow, refer to the documentation in See Also. For an example Inference Engine API snippet, see Request-Based API and “GetBlob” Idiom. ### Deep Learning Inference Engine Overview -Deep Learning Inference Engine is a part of Intel® Deep Learning Deployment Toolkit (Intel® DL Deployment Toolkit) and OpenVINO™ toolkit. Inference Engine facilitates deployment of deep learning solutions by delivering a unified, device-agnostic API. +Deep Learning Inference Engine is a part of Intel® Deep Learning Deployment Toolkit (Intel® DL Deployment Toolkit) and OpenVINO™ toolkit. Inference Engine facilitates deployment of deep learning solutions by delivering a unified, device-agnostic API. Below, there are the three main steps of the deployment process: @@ -50,7 +50,7 @@ When evaluating performance of your model with the Inference Engine, you must me ### Latency vs. Throughput -In the asynchronous case (see Request-Based API and “GetBlob” Idiom), the performance of an individual infer request is usually of less concern. Instead, you typically execute multiple requests asynchronously and measure the throughput in images per second by dividing the number of images that were processed by the processing time. +In the asynchronous case (see Request-Based API and “GetBlob” Idiom), the performance of an individual infer request is usually of less concern. Instead, you typically execute multiple requests asynchronously and measure the throughput in images per second by dividing the number of images that were processed by the processing time. In contrast, for the latency-oriented tasks, the time to a single frame is more important. Refer to the [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample, which allows latency vs. throughput measuring. @@ -114,23 +114,23 @@ The resulting IR precision, for instance, `FP16` or `FP32`, directly affects per ## Multi-Device Execution OpenVINO™ toolkit supports automatic multi-device execution, please see [MULTI-Device plugin description](../IE_DG/supported_plugins/MULTI.md). -In the next chapter you can find the device-specific tips, while this section covers few recommendations +In the next chapter you can find the device-specific tips, while this section covers few recommendations for the multi-device execution: -- MULTI usually performs best when the fastest device is specified first in the list of the devices. - This is particularly important when the parallelism is not sufficient +- MULTI usually performs best when the fastest device is specified first in the list of the devices. + This is particularly important when the parallelism is not sufficient (e.g. the number of request in the flight is not enough to saturate all devices). 
-- It is highly recommended to query the optimal number of inference requests directly from the instance of the ExecutionNetwork - (resulted from the LoadNetwork call with the specific multi-device configuration as a parameter). -Please refer to the code of the [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample for details. -- Notice that for example CPU+GPU execution performs better with certain knobs +- It is highly recommended to query the optimal number of inference requests directly from the instance of the ExecutionNetwork + (resulted from the LoadNetwork call with the specific multi-device configuration as a parameter). +Please refer to the code of the [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample for details. +- Notice that for example CPU+GPU execution performs better with certain knobs which you can find in the code of the same [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample. - One specific example is disabling GPU driver polling, which in turn requires multiple GPU streams (which is already a default for the GPU) to amortize slower + One specific example is disabling GPU driver polling, which in turn requires multiple GPU streams (which is already a default for the GPU) to amortize slower inference completion from the device to the host. -- Multi-device logic always attempts to save on the (e.g. inputs) data copies between device-agnostic, user-facing inference requests - and device-specific 'worker' requests that are being actually scheduled behind the scene. - To facilitate the copy savings, it is recommended to start the requests in the order that they were created +- Multi-device logic always attempts to save on the (e.g. inputs) data copies between device-agnostic, user-facing inference requests + and device-specific 'worker' requests that are being actually scheduled behind the scene. + To facilitate the copy savings, it is recommended to start the requests in the order that they were created (with ExecutableNetwork's CreateInferRequest). - + ## Device-Specific Optimizations @@ -171,7 +171,7 @@ Notice that on a multi-socket machine, the bare minimum of streams for a latency In addition, you can play with the batch size to find the throughput sweet spot. -If your application is hard or impossible to change in accordance with the multiple-requests logic, consider the "multiple-instance" trick to improve the throughput: +If your application is hard or impossible to change in accordance with the multiple-requests logic, consider the "multiple-instance" trick to improve the throughput: - For multi-socket execution, it is recommended to set [`KEY_CPU_THREADS_NUM`](../IE_DG/supported_plugins/CPU.md) to the number of cores per socket, and run as many instances of the application as you have sockets. - Similarly, for extremely lightweight networks (running faster than 1ms) and/or many-core machines (16+ cores), try limiting the number of CPU inference threads to just `#‍phys` cores and further, while trying to saturate the machine with running multiple instances of the application. @@ -186,15 +186,15 @@ Inference Engine relies on the [Compute Library for Deep Neural Networks (clDNN) - If your application is simultaneously using the inference on the CPU or otherwise loads the host heavily, make sure that the OpenCL driver threads do not starve. You can use [CPU configuration options](../IE_DG/supported_plugins/CPU.md) to limit number of inference threads for the CPU plugin. 
- In the GPU-only scenario, a GPU driver might occupy a CPU core with spin-looped polling for completion. If the _CPU_ utilization is a concern, consider the `KEY_CLDND_PLUGIN_THROTTLE` configuration option. -> **NOTE**: See the [Benchmark App Sample](../../inference-engine/samples/benchmark_app/README.md) code for a usage example. -Notice that while disabling the polling, this option might reduce the GPU performance, so usually this option is used with multiple [GPU streams](../IE_DG/supported_plugins/CL_DNN.md). +> **NOTE**: See the [Benchmark App Sample](../../inference-engine/samples/benchmark_app/README.md) code for a usage example. +Notice that while disabling the polling, this option might reduce the GPU performance, so usually this option is used with multiple [GPU streams](../IE_DG/supported_plugins/GPU.md). ### Intel® Movidius™ Myriad™ X Visual Processing Unit and Intel® Vision Accelerator Design with Intel® Movidius™ VPUs Since Intel® Movidius™ Myriad™ X Visual Processing Unit (Intel® Movidius™ Myriad™ 2 VPU) communicates with the host over USB, minimum four infer requests in flight are recommended to hide the data transfer costs. See Request-Based API and “GetBlob” Idiom and [Benchmark App Sample](../../inference-engine/samples/benchmark_app/README.md) for more information. -Intel® Vision Accelerator Design with Intel® Movidius™ VPUs requires to keep at least 32 inference requests in flight to fully saturate the device. +Intel® Vision Accelerator Design with Intel® Movidius™ VPUs requires to keep at least 32 inference requests in flight to fully saturate the device. ### FPGA @@ -274,7 +274,7 @@ The following tips are provided to give general guidance on optimizing execution - Generally, GPU performance is better on heavy kernels (like Convolutions) and large inputs. So if the network inference time is already too small (~1ms of execution time), using the GPU would unlikely give a boost. -- A typical strategy to start with is to test the CPU-only and GPU-only scenarios first (with samples this is plain `-d CPU` or `-d GPU`). If there are specific kernels that are not supported by the GPU, the best option to try is the `HETERO:GPU,CPU` that automatically applies default splitting (based on the plugins layers support). Then, you can play with the manual affinity settings (for example, to further minimize the number of subgraphs). +- A typical strategy to start with is to test the CPU-only and GPU-only scenarios first (with samples this is plain `-d CPU` or `-d GPU`). If there are specific kernels that are not supported by the GPU, the best option to try is the `HETERO:GPU,CPU` that automatically applies default splitting (based on the plugins layers support). Then, you can play with the manual affinity settings (for example, to further minimize the number of subgraphs). - The general affinity “rule of thumb” is to keep computationally-intensive kernels on the accelerator, and "glue" (or helper) kernels on the CPU. Notice that this includes the granularity considerations. For example, running some (custom) activation on the CPU would result in too many conversions. 
@@ -337,7 +337,7 @@ For inference on the CPU there are multiple threads binding options, see If you are building an app-level pipeline with third-party components like GStreamer*, the general guidance for NUMA machines is as follows: - Whenever possible, use at least one instance of the pipeline per NUMA node: - - Pin the _entire_ pipeline instance to the specific NUMA node at the outer-most level (for example, use Kubernetes* and/or `numactl` command with proper settings before actual GStreamer commands). + - Pin the _entire_ pipeline instance to the specific NUMA node at the outer-most level (for example, use Kubernetes* and/or `numactl` command with proper settings before actual GStreamer commands). - Disable any individual pinning by the pipeline components (e.g. set [CPU_BIND_THREADS to 'NO'](../IE_DG/supported_plugins/CPU.md)). - Limit each instance with respect to number of inference threads. Use [CPU_THREADS_NUM](../IE_DG/supported_plugins/CPU.md) or or other means (e.g. virtualization, Kubernetes*, etc), to avoid oversubscription. - If pinning instancing/pinning of the entire pipeline is not possible or desirable, relax the inference threads pinning to just 'NUMA'. @@ -416,7 +416,7 @@ If your application simultaneously executes multiple infer requests: - For FPGA and GPU, the actual work is serialized by a plugin and/or a driver anyway. -- Finally, for any VPU flavor, using multiple requests is a must for achieving good throughput. +- Finally, for any VPU flavor, using multiple requests is a must for achieving good throughput. In the Inference Engine, there is no notion of requests priorities. It is left to the user side (for example, not queuing the low priority infer request, until another higher priority is waiting). Notice that it would require additional logic to synchronize between executable networks (queues) in your application code. @@ -470,12 +470,12 @@ Example of Inference Engine calls: Notice that `Task_runNOThrow` is an Async API wrapper and it is executed in a different thread and triggers the Intel MKL-DNN execution: ![](../img/vtune_timeline.png) - + - In the Intel VTune Amplifier **Top-down view**, grouped by the **Task Domain**. Notice the `Task_runNoThrow` and `MKLDNN _INFER` that are bracketing the actual Intel MKL-DNN kernels execution: - + ![](../img/vtune_topdown_view.jpg) - + Similarly, you can use any GPU analysis in the Intel VTune Amplifier and get general correlation with Inference Engine API as well as the execution breakdown for OpenCL kernels. Just like with regular native application, further drill down in the counters is possible, however, this is mostly useful for optimizing custom kernels. Finally, with the Intel VTune Amplifier, the profiling is not limited to your user-level code (see the [corresponding section in the Intel® VTune™ Amplifier User's Guide](https://software.intel.com/en-us/vtune-amplifier-help-analyze-performance)). @@ -513,12 +513,12 @@ Since FPGA execution does not separate individual kernels, only bulk execution/d ``` subgraph1: 1. input preprocessing (mean data/FPGA):EXECUTED layerType: preprocessing realTime: 129 cpu: 129 -subgraph1: 2. input transfer to DDR:EXECUTED layerType: realTime: 201 cpu: 0 -subgraph1: 3. FPGA execute time:EXECUTED layerType: realTime: 3808 cpu: 0 subgraph1: 4. output transfer from DDR:EXECUTED layerType: realTime: 55 cpu: 0 -subgraph1: 5. FPGA output postprocessing:EXECUTED layerType: realTime: 7 cpu: 7 -subgraph1: 6. 
softmax/copy: EXECUTED layerType: realTime: 2 cpu: 2 -subgraph2: out_prob: NOT_RUN layerType: Output realTime: 0 cpu: 0 -subgraph2: prob: EXECUTED layerType: SoftMax realTime: 10 cpu: 10 +subgraph1: 2. input transfer to DDR:EXECUTED layerType: realTime: 201 cpu: 0 +subgraph1: 3. FPGA execute time:EXECUTED layerType: realTime: 3808 cpu: 0 subgraph1: 4. output transfer from DDR:EXECUTED layerType: realTime: 55 cpu: 0 +subgraph1: 5. FPGA output postprocessing:EXECUTED layerType: realTime: 7 cpu: 7 +subgraph1: 6. softmax/copy: EXECUTED layerType: realTime: 2 cpu: 2 +subgraph2: out_prob: NOT_RUN layerType: Output realTime: 0 cpu: 0 +subgraph2: prob: EXECUTED layerType: SoftMax realTime: 10 cpu: 10 Total time: 4212 microseconds ``` diff --git a/docs/snippets/GPU_Kernel.cpp b/docs/snippets/GPU_Kernel.cpp index 5f849eb6a6a6a9..8b21a79dfe27dd 100644 --- a/docs/snippets/GPU_Kernel.cpp +++ b/docs/snippets/GPU_Kernel.cpp @@ -1,5 +1,4 @@ #include -#include "cldnn/cldnn_config.hpp" int main() { using namespace InferenceEngine; @@ -9,9 +8,5 @@ InferenceEngine::Core core; core.SetConfig({ { InferenceEngine::PluginConfigParams::KEY_CONFIG_FILE, "" } }, "GPU"); //! [part0] -//! [part1] -core.SetConfig({ { PluginConfigParams::KEY_DUMP_KERNELS, PluginConfigParams::YES } }, "GPU"); -//! [part1] - return 0; } diff --git a/docs/snippets/GPU_Kernels_Tuning.cpp b/docs/snippets/GPU_Kernels_Tuning.cpp deleted file mode 100644 index 25daeec5e2a263..00000000000000 --- a/docs/snippets/GPU_Kernels_Tuning.cpp +++ /dev/null @@ -1,14 +0,0 @@ -#include -#include "cldnn/cldnn_config.hpp" - -int main() { -using namespace InferenceEngine; -//! [part0] -Core ie; - ie.SetConfig({{ CONFIG_KEY(TUNING_MODE), CONFIG_VALUE(TUNING_CREATE) }}, "GPU"); - ie.SetConfig({{ CONFIG_KEY(TUNING_FILE), "/path/to/tuning/file.json" }}, "GPU"); - // Further LoadNetwork calls will use the specified tuning parameters -//! [part0] - -return 0; -} diff --git a/docs/snippets/GPU_RemoteBlob_API2.cpp b/docs/snippets/GPU_RemoteBlob_API2.cpp index 1bb00c17e03e94..13597ae45617ba 100644 --- a/docs/snippets/GPU_RemoteBlob_API2.cpp +++ b/docs/snippets/GPU_RemoteBlob_API2.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include int main() { @@ -28,7 +28,7 @@ auto shared_va_context = gpu::make_shared_context(ie, "GPU", disp); // compile network within a shared context ExecutableNetwork executable_network = ie.LoadNetwork(network, shared_va_context, - { { CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS, + { { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, PluginConfigParams::YES } }); diff --git a/inference-engine/include/cldnn/cldnn_config.hpp b/inference-engine/include/cldnn/cldnn_config.hpp index cbc2aef0242101..3e5dc4cfb126f6 100644 --- a/inference-engine/include/cldnn/cldnn_config.hpp +++ b/inference-engine/include/cldnn/cldnn_config.hpp @@ -11,47 +11,11 @@ #pragma once #include "ie_plugin_config.hpp" +#include "ie_api.h" +#include "gpu/gpu_config.hpp" namespace InferenceEngine { -namespace Metrics { - -/** - * @def GPU_METRIC_KEY(name) - * @brief shortcut for defining GPU plugin metrics - */ -#define GPU_METRIC_KEY(name) METRIC_KEY(GPU_##name) -#define DECLARE_GPU_METRIC_KEY(name, ...) DECLARE_METRIC_KEY(GPU_##name, __VA_ARGS__) - -/** - * @def DECLARE_GPU_METRIC_VALUE(name) - * @brief shortcut for defining gpu metric values - */ -#define DECLARE_GPU_METRIC_VALUE(name) DECLARE_METRIC_VALUE(GPU_##name) - -/** - * @brief Metric which defines size of memory in bytes available for the device. 
For iGPU it returns host memory size, for dGPU - dedicated gpu memory size - */ -DECLARE_GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE, uint64_t); - -/** - * @brief Metric to get microarchitecture identifier in major.minor.revision format - */ -DECLARE_GPU_METRIC_KEY(UARCH_VERSION, std::string); - -/** - * @brief Metric to get count of execution units for current GPU - */ -DECLARE_GPU_METRIC_KEY(EXECUTION_UNITS_COUNT, int); - -/** - * @brief Possible return value for OPTIMIZATION_CAPABILITIES metric - * - "HW_MATMUL" - Defines if device has hardware block for matrix multiplication - */ -DECLARE_GPU_METRIC_VALUE(HW_MATMUL); - -} // namespace Metrics - /** * @brief GPU plugin configuration */ @@ -70,6 +34,7 @@ namespace CLDNNConfigParams { * this option should be used with an unsigned integer value (1 is lowest priority) * 0 means no priority hint is set and default queue is created. */ +INFERENCE_ENGINE_DEPRECATED("Use InferenceEngine::GPUConfigParams::GPU_PLUGIN_PRIORITY instead") DECLARE_CLDNN_CONFIG_KEY(PLUGIN_PRIORITY); /** @@ -78,22 +43,26 @@ DECLARE_CLDNN_CONFIG_KEY(PLUGIN_PRIORITY); * chapter 9.19. This option should be used with an unsigned integer value (1 is lowest energy consumption) * 0 means no throttle hint is set and default queue created. */ +INFERENCE_ENGINE_DEPRECATED("Use InferenceEngine::GPUConfigParams::GPU_PLUGIN_THROTTLE instead") DECLARE_CLDNN_CONFIG_KEY(PLUGIN_THROTTLE); /** * @brief This key controls clDNN memory pool optimization. * Turned off by default. */ +INFERENCE_ENGINE_DEPRECATED("The config key will be removed") DECLARE_CLDNN_CONFIG_KEY(MEM_POOL); /** * @brief This key defines the directory name to which clDNN graph visualization will be dumped. */ +INFERENCE_ENGINE_DEPRECATED("The config key will be removed") DECLARE_CLDNN_CONFIG_KEY(GRAPH_DUMPS_DIR); /** * @brief This key defines the directory name to which full program sources will be dumped. */ +INFERENCE_ENGINE_DEPRECATED("The config key will be removed") DECLARE_CLDNN_CONFIG_KEY(SOURCES_DUMPS_DIR); /** @@ -108,43 +77,19 @@ DECLARE_CLDNN_CONFIG_KEY(ENABLE_FP16_FOR_QUANTIZED_MODELS); * @brief This key should be set to correctly handle NV12 input without pre-processing. * Turned off by default. */ +INFERENCE_ENGINE_DEPRECATED("Use InferenceEngine::GPUConfigParams::GPU_NV12_TWO_INPUTS instead") DECLARE_CLDNN_CONFIG_KEY(NV12_TWO_INPUTS); -/** - * @brief This key sets the max number of host threads that can be used by GPU plugin on model loading. - * Default value is maximum number of threads available in the environment. - */ -DECLARE_CLDNN_CONFIG_KEY(MAX_NUM_THREADS); - -/** - * @brief Turning on this key enables to unroll recurrent layers such as TensorIterator or Loop with fixed iteration count. - * This key is turned on by default. Turning this key on will achieve better inference performance for loops with not too many iteration counts (less than 16, as a rule of thumb). - * Turning this key off will achieve better performance for both graph loading time and inference time with many iteration counts (greater than 16). - * Note that turning this key on will increase the graph loading time in proportion to the iteration counts. - * Thus, this key should be turned off if graph loading time is considered to be most important target to optimize.*/ -DECLARE_CLDNN_CONFIG_KEY(ENABLE_LOOP_UNROLLING); - } // namespace CLDNNConfigParams namespace PluginConfigParams { -/** - * @brief Optimize GPU plugin execution to maximize throughput. 
diff --git a/inference-engine/include/gpu/gpu_config.hpp b/inference-engine/include/gpu/gpu_config.hpp
new file mode 100644
index 00000000000000..96f8754ac8660a
--- /dev/null
+++ b/inference-engine/include/gpu/gpu_config.hpp
@@ -0,0 +1,120 @@
+// Copyright (C) 2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief A header for advanced hardware-related properties for the GPU plugin,
+ *        to be used in the SetConfig() method of plugins
+ *
+ * @file gpu_config.hpp
+ */
+#pragma once
+
+#include "ie_plugin_config.hpp"
+
+namespace InferenceEngine {
+
+namespace Metrics {
+
+/**
+ * @def GPU_METRIC_KEY(name)
+ * @brief shortcut for defining GPU plugin metrics
+ */
+#define GPU_METRIC_KEY(name) METRIC_KEY(GPU_##name)
+#define DECLARE_GPU_METRIC_KEY(name, ...) DECLARE_METRIC_KEY(GPU_##name, __VA_ARGS__)
+
+/**
+ * @def DECLARE_GPU_METRIC_VALUE(name)
+ * @brief shortcut for defining GPU metric values
+ */
+#define DECLARE_GPU_METRIC_VALUE(name) DECLARE_METRIC_VALUE(GPU_##name)
+
+/**
+ * @brief Metric which defines the size of memory, in bytes, available for the device. For iGPU it returns the host memory size; for dGPU, the dedicated GPU memory size
+ */
+DECLARE_GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE, uint64_t);
+
+/**
+ * @brief Metric to get the microarchitecture identifier in major.minor.revision format
+ */
+DECLARE_GPU_METRIC_KEY(UARCH_VERSION, std::string);
+
+/**
+ * @brief Metric to get the number of execution units for the current GPU
+ */
+DECLARE_GPU_METRIC_KEY(EXECUTION_UNITS_COUNT, int);
+
+/**
+ * @brief Possible return value for the OPTIMIZATION_CAPABILITIES metric
+ *  - "HW_MATMUL" - defines whether the device has a hardware block for matrix multiplication
+ */
+DECLARE_GPU_METRIC_VALUE(HW_MATMUL);
+
+} // namespace Metrics
+
+/**
+ * @brief GPU plugin configuration
+ */
+namespace GPUConfigParams {
+
+/**
+ * @brief shortcut for defining configuration keys
+ */
+#define GPU_CONFIG_KEY(name) InferenceEngine::GPUConfigParams::_CONFIG_KEY(GPU_##name)
+#define DECLARE_GPU_CONFIG_KEY(name) DECLARE_CONFIG_KEY(GPU_##name)
+#define DECLARE_GPU_CONFIG_VALUE(name) DECLARE_CONFIG_VALUE(GPU_##name)
+
+/**
+ * @brief This key instructs the GPU plugin to use the OpenCL queue priority hint
+ * as defined in https://www.khronos.org/registry/OpenCL/specs/opencl-2.1-extensions.pdf.
+ * This option should be used with an unsigned integer value (1 is the lowest priority);
+ * 0 means no priority hint is set and the default queue is created.
+ */
+DECLARE_GPU_CONFIG_KEY(PLUGIN_PRIORITY);
+
+/**
+ * @brief This key instructs the GPU plugin to use the OpenCL queue throttle hint
+ * as defined in https://www.khronos.org/registry/OpenCL/specs/opencl-2.1-extensions.pdf,
+ * chapter 9.19. This option should be used with an unsigned integer value (1 is the lowest energy consumption);
+ * 0 means no throttle hint is set and the default queue is created.
+ */
+DECLARE_GPU_CONFIG_KEY(PLUGIN_THROTTLE);
+
+/**
+ * @brief This key should be set to correctly handle NV12 input without pre-processing.
+ * Turned off by default.
+ */
+DECLARE_GPU_CONFIG_KEY(NV12_TWO_INPUTS);
+
+/**
+ * @brief This key sets the maximum number of host threads that can be used by the GPU plugin during model loading.
+ * The default value is the maximum number of threads available in the environment.
+ */
+DECLARE_GPU_CONFIG_KEY(MAX_NUM_THREADS);
+
+/**
+ * @brief Turning this key on enables unrolling of recurrent layers such as TensorIterator or Loop with a fixed iteration count.
+ * This key is turned on by default. Turning it on achieves better inference performance for loops with a small iteration count (fewer than 16, as a rule of thumb).
+ * Turning it off achieves better performance for both graph loading time and inference time when the iteration count is large (greater than 16).
+ * Note that turning this key on increases the graph loading time in proportion to the iteration count.
+ * Thus, this key should be turned off if graph loading time is the most important target to optimize.
+ */
+DECLARE_GPU_CONFIG_KEY(ENABLE_LOOP_UNROLLING);
+
+} // namespace GPUConfigParams
+
+namespace PluginConfigParams {
+
+/**
+ * @brief Optimize GPU plugin execution to maximize throughput.
+ *
+ * It is passed to Core::SetConfig() and should be used with the following values:
+ * - KEY_GPU_THROUGHPUT_AUTO creates the bare minimum of streams that might improve performance in some cases;
+ *   this option also enables the throttle hint for the OpenCL queue, which reduces CPU load without a significant
+ *   performance drop
+ * - a positive integer value creates the requested number of streams
+ */
+DECLARE_CONFIG_VALUE(GPU_THROUGHPUT_AUTO);
+DECLARE_CONFIG_KEY(GPU_THROUGHPUT_STREAMS);
+} // namespace PluginConfigParams
+
+} // namespace InferenceEngine
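Taken together, the new header provides GPU-prefixed spellings for every option that survives the cleanup. A small usage sketch of the new keys (the values are chosen for illustration only):

```cpp
#include <ie_core.hpp>
#include <gpu/gpu_config.hpp>

int main() {
    using namespace InferenceEngine;
    Core ie;
    ie.SetConfig({
        // Lowest-priority OpenCL queue hint (0 would mean "no hint").
        { GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY, "1" },
        // Lowest-energy throttle hint for the OpenCL queue.
        { GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE, "1" },
        // Cap the host threads used while the model is being compiled.
        { GPUConfigParams::KEY_GPU_MAX_NUM_THREADS, "4" },
        // Prefer faster graph loading over unrolled loops for long TensorIterators.
        { GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING, PluginConfigParams::NO },
        // Let the plugin pick the number of throughput streams.
        { PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, PluginConfigParams::GPU_THROUGHPUT_AUTO }
    }, "GPU");
    return 0;
}
```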
diff --git a/inference-engine/samples/benchmark_app/main.cpp b/inference-engine/samples/benchmark_app/main.cpp
index 849dc05ad3344a..cd7ddc641dc256 100644
--- a/inference-engine/samples/benchmark_app/main.cpp
+++ b/inference-engine/samples/benchmark_app/main.cpp
@@ -4,8 +4,8 @@
 
 #include 
 #include 
-#include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -282,7 +282,7 @@ int main(int argc, char* argv[]) {
                           << "which releases another CPU thread (that is otherwise "
                              "used by the GPU driver for active polling)"
                           << slog::endl;
-                device_config[CLDNN_CONFIG_KEY(PLUGIN_THROTTLE)] = "1";
+                device_config[GPU_CONFIG_KEY(PLUGIN_THROTTLE)] = "1";
             }
         } else if (device == "MYRIAD") {
             device_config[CONFIG_KEY(LOG_LEVEL)] = CONFIG_VALUE(LOG_WARNING);
diff --git a/inference-engine/samples/hello_query_device/README.md b/inference-engine/samples/hello_query_device/README.md
index a185147f8ec50e..059077c48ad6b4 100644
--- a/inference-engine/samples/hello_query_device/README.md
+++ b/inference-engine/samples/hello_query_device/README.md
@@ -63,20 +63,20 @@ Available devices:
         SUPPORTED_METRICS : [ AVAILABLE_DEVICES SUPPORTED_METRICS FULL_DEVICE_NAME OPTIMIZATION_CAPABILITIES SUPPORTED_CONFIG_KEYS RANGE_FOR_ASYNC_INFER_REQUESTS RANGE_FOR_STREAMS ]
         FULL_DEVICE_NAME : Intel(R) UHD Graphics 620 (iGPU)
         OPTIMIZATION_CAPABILITIES : [ FP32 BIN FP16 ]
-        SUPPORTED_CONFIG_KEYS : [ CACHE_DIR CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS CLDNN_GRAPH_DUMPS_DIR CLDNN_MAX_NUM_THREADS CLDNN_MEM_POOL CLDNN_NV12_TWO_INPUTS CLDNN_PLUGIN_PRIORITY CLDNN_PLUGIN_THROTTLE CLDNN_SOURCES_DUMPS_DIR CLDNN_ENABLE_LOOP_UNROLLING CONFIG_FILE DEVICE_ID DUMP_KERNELS DYN_BATCH_ENABLED EXCLUSIVE_ASYNC_REQUESTS GPU_THROUGHPUT_STREAMS PERF_COUNT TUNING_FILE TUNING_MODE ]
+        SUPPORTED_CONFIG_KEYS : [ CACHE_DIR CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS CLDNN_GRAPH_DUMPS_DIR GPU_MAX_NUM_THREADS CLDNN_MEM_POOL CLDNN_NV12_TWO_INPUTS CLDNN_PLUGIN_PRIORITY CLDNN_PLUGIN_THROTTLE CLDNN_SOURCES_DUMPS_DIR GPU_ENABLE_LOOP_UNROLLING CONFIG_FILE DEVICE_ID DUMP_KERNELS DYN_BATCH_ENABLED EXCLUSIVE_ASYNC_REQUESTS GPU_THROUGHPUT_STREAMS PERF_COUNT TUNING_FILE TUNING_MODE ]
         RANGE_FOR_ASYNC_INFER_REQUESTS : { 1, 2, 1 }
         RANGE_FOR_STREAMS : { 1, 2 }
     Default values for device configuration keys:
         CACHE_DIR : ""
         CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS : YES
         CLDNN_GRAPH_DUMPS_DIR : ""
-        CLDNN_MAX_NUM_THREADS : 8
         CLDNN_MEM_POOL : YES
         CLDNN_NV12_TWO_INPUTS : NO
         CLDNN_PLUGIN_PRIORITY : 0
         CLDNN_PLUGIN_THROTTLE : 0
         CLDNN_SOURCES_DUMPS_DIR : ""
-        CLDNN_ENABLE_LOOP_UNROLLING : YES
+        GPU_MAX_NUM_THREADS : 8
+        GPU_ENABLE_LOOP_UNROLLING : YES
         CONFIG_FILE : ""
         DEVICE_ID : ""
         DUMP_KERNELS : NO
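The sample output above can also be reproduced programmatically. A minimal sketch that queries the GPU metrics introduced by this patch (the device name is hard-coded for brevity):

```cpp
#include <iostream>
#include <string>
#include <vector>

#include <ie_core.hpp>
#include <gpu/gpu_config.hpp>

int main() {
    using namespace InferenceEngine;
    Core ie;
    // Total device memory in bytes: host memory for iGPU, dedicated memory for dGPU.
    auto mem_size = ie.GetMetric("GPU", GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE)).as<uint64_t>();
    // Microarchitecture identifier in major.minor.revision format.
    auto uarch = ie.GetMetric("GPU", GPU_METRIC_KEY(UARCH_VERSION)).as<std::string>();
    std::cout << "Memory: " << mem_size << " bytes, uarch: " << uarch << std::endl;
    // Both the new GPU_* keys and the legacy CLDNN_* keys are expected to appear here.
    auto keys = ie.GetMetric("GPU", METRIC_KEY(SUPPORTED_CONFIG_KEYS)).as<std::vector<std::string>>();
    for (const auto& key : keys)
        std::cout << key << std::endl;
    return 0;
}
```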
"cpp_interfaces/interface/ie_internal_plugin_config.hpp" #include "ie_api.h" @@ -39,6 +40,7 @@ static void createDirectory(std::string _path) { } } +IE_SUPPRESS_DEPRECATED_START void Config::UpdateFromMap(const std::map& configMap) { OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "Config::UpdateFromMap"); for (auto& kvp : configMap) { @@ -69,7 +71,8 @@ void Config::UpdateFromMap(const std::map& configMap) } else { IE_THROW(NotFound) << "Unsupported property value by plugin: " << val; } - } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_PLUGIN_PRIORITY) == 0) { + } else if (key.compare(GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY) == 0 || + key.compare(CLDNNConfigParams::KEY_CLDNN_PLUGIN_PRIORITY) == 0) { std::stringstream ss(val); uint32_t uVal(0); ss >> uVal; @@ -93,7 +96,8 @@ void Config::UpdateFromMap(const std::map& configMap) IE_THROW(ParameterMismatch) << "Unsupported queue priority value: " << uVal; } - } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_PLUGIN_THROTTLE) == 0) { + } else if (key.compare(GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE) == 0 || + key.compare(CLDNNConfigParams::KEY_CLDNN_PLUGIN_THROTTLE) == 0) { std::stringstream ss(val); uint32_t uVal(0); ss >> uVal; @@ -205,7 +209,8 @@ void Config::UpdateFromMap(const std::map& configMap) } else { IE_THROW(NotFound) << "Unsupported property value by plugin: " << val; } - } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS) == 0) { + } else if (key.compare(GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS) == 0 || + key.compare(CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS) == 0) { if (val.compare(PluginConfigParams::YES) == 0) { nv12_two_inputs = true; } else if (val.compare(PluginConfigParams::NO) == 0) { @@ -221,7 +226,7 @@ void Config::UpdateFromMap(const std::map& configMap) } else { IE_THROW(NotFound) << "Unsupported KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS flag value: " << val; } - } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS) == 0) { + } else if (key.compare(GPUConfigParams::KEY_GPU_MAX_NUM_THREADS) == 0) { int max_threads = std::max(1, static_cast(std::thread::hardware_concurrency())); try { int val_i = std::stoi(val); @@ -231,17 +236,17 @@ void Config::UpdateFromMap(const std::map& configMap) n_threads = val_i; } } catch (const std::exception&) { - IE_THROW() << "Wrong value for property key " << CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS << ": " << val + IE_THROW() << "Wrong value for property key " << GPUConfigParams::KEY_GPU_MAX_NUM_THREADS << ": " << val << "\nSpecify the number of threads use for build as an integer." 
<< "\nOut of range value will be set as a default value, maximum concurrent threads."; } - } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_ENABLE_LOOP_UNROLLING) == 0) { + } else if (key.compare(GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING) == 0) { if (val.compare(PluginConfigParams::YES) == 0) { enable_loop_unrolling = true; } else if (val.compare(PluginConfigParams::NO) == 0) { enable_loop_unrolling = false; } else { - IE_THROW(ParameterMismatch) << "Unsupported KEY_CLDNN_ENABLE_LOOP_UNROLLING flag value: " << val; + IE_THROW(ParameterMismatch) << "Unsupported KEY_GPU_ENABLE_LOOP_UNROLLING flag value: " << val; } } else { IE_THROW(NotFound) << "Unsupported property key by plugin: " << key; @@ -297,6 +302,7 @@ void Config::adjustKeyMapValues() { default: break; } key_config_map[CLDNNConfigParams::KEY_CLDNN_PLUGIN_PRIORITY] = qp; + key_config_map[GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY] = qp; } { std::string qt = "0"; @@ -307,6 +313,7 @@ void Config::adjustKeyMapValues() { default: break; } key_config_map[CLDNNConfigParams::KEY_CLDNN_PLUGIN_THROTTLE] = qt; + key_config_map[GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE] = qt; } { std::string tm = PluginConfigParams::TUNING_DISABLED; @@ -328,11 +335,13 @@ void Config::adjustKeyMapValues() { key_config_map[PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS] = std::to_string(throughput_streams); key_config_map[PluginConfigParams::KEY_DEVICE_ID] = device_id; key_config_map[PluginConfigParams::KEY_CONFIG_FILE] = ""; - key_config_map[CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS] = std::to_string(n_threads); + key_config_map[GPUConfigParams::KEY_GPU_MAX_NUM_THREADS] = std::to_string(n_threads); if (enable_loop_unrolling) - key_config_map[CLDNNConfigParams::KEY_CLDNN_ENABLE_LOOP_UNROLLING] = PluginConfigParams::YES; + key_config_map[GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING] = PluginConfigParams::YES; else - key_config_map[CLDNNConfigParams::KEY_CLDNN_ENABLE_LOOP_UNROLLING] = PluginConfigParams::NO; + key_config_map[GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING] = PluginConfigParams::NO; } +IE_SUPPRESS_DEPRECATED_END + } // namespace CLDNNPlugin diff --git a/inference-engine/src/cldnn_engine/cldnn_engine.cpp b/inference-engine/src/cldnn_engine/cldnn_engine.cpp index 86b9f2e4b9526b..171919a80775d6 100644 --- a/inference-engine/src/cldnn_engine/cldnn_engine.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_engine.cpp @@ -79,7 +79,7 @@ #include "cldnn_executable_network.h" #include "cldnn_custom_layer.h" #include "cldnn_itt.h" -#include "cldnn/cldnn_config.hpp" +#include "gpu/gpu_config.hpp" #ifdef __linux__ # include diff --git a/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp b/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp index c2289fa9fb0bde..5191da35c2e369 100644 --- a/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp @@ -16,7 +16,6 @@ #include "cldnn_itt.h" #include -#include #include "cldnn_infer_request.h" #include #include "cldnn_async_infer_request.h" diff --git a/inference-engine/src/cldnn_engine/cldnn_graph.cpp b/inference-engine/src/cldnn_engine/cldnn_graph.cpp index 04d40c9815d00d..1f835d8ac2c991 100644 --- a/inference-engine/src/cldnn_engine/cldnn_graph.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_graph.cpp @@ -16,7 +16,6 @@ #include "cldnn_graph.h" #include "simple_math.h" #include -#include #include "cldnn_infer_request.h" #include #include diff --git 
a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp
index e6415688de2484..6837c0b84c33a8 100644
--- a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp
+++ b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp
@@ -9,7 +9,7 @@
 #include 
 
-#include 
+#include 
 #include 
 #include 
 #include 
@@ -175,7 +175,7 @@ TEST_P(BatchedBlob_Test, canInputNV12) {
     /* XXX: is it correct to set KEY_CLDNN_NV12_TWO_INPUTS in case of remote blob? */
     auto exec_net_b = ie.LoadNetwork(net_remote, CommonTestUtils::DEVICE_GPU,
-                { { CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS, PluginConfigParams::YES} });
+                { { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, PluginConfigParams::YES} });
     auto inf_req_remote = exec_net_b.CreateInferRequest();
     auto cldnn_context = exec_net_b.GetContext();
     cl_context ctx = std::dynamic_pointer_cast(cldnn_context)->get();
diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/config.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/config.cpp
index e21d610db569cb..a8c039e43915a4 100644
--- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/config.cpp
+++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/config.cpp
@@ -4,6 +4,7 @@
 
 #include "behavior/config.hpp"
 #include "cldnn/cldnn_config.hpp"
+#include "gpu/gpu_config.hpp"
 
 using namespace BehaviorTestsDefinitions;
 namespace {
@@ -12,6 +13,7 @@ namespace {
             InferenceEngine::Precision::FP16
     };
 
+    IE_SUPPRESS_DEPRECATED_START
     const std::vector> inconfigs = {
             {{InferenceEngine::PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, "OFF"}},
             {{InferenceEngine::PluginConfigParams::KEY_PERF_COUNT, "ON"}},
@@ -46,6 +48,7 @@ namespace {
             {{InferenceEngine::KEY_AUTO_DEVICE_LIST , CommonTestUtils::DEVICE_GPU},
              {InferenceEngine::PluginConfigParams::KEY_DEVICE_ID, "DEVICE_UNKNOWN"}}
     };
+    IE_SUPPRESS_DEPRECATED_END
 
     INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, IncorrectConfigTests,
             ::testing::Combine(
@@ -73,6 +76,29 @@ namespace {
             {}
     };
 
+    IE_SUPPRESS_DEPRECATED_START
+    const std::vector> conf_gpu = {
+            // Deprecated
+            {{InferenceEngine::CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS, InferenceEngine::PluginConfigParams::YES}},
+            {{InferenceEngine::CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS, InferenceEngine::PluginConfigParams::NO}},
+            {{InferenceEngine::CLDNNConfigParams::KEY_CLDNN_PLUGIN_THROTTLE, "0"}},
+            {{InferenceEngine::CLDNNConfigParams::KEY_CLDNN_PLUGIN_THROTTLE, "1"}},
+            {{InferenceEngine::CLDNNConfigParams::KEY_CLDNN_PLUGIN_PRIORITY, "0"}},
+            {{InferenceEngine::CLDNNConfigParams::KEY_CLDNN_PLUGIN_PRIORITY, "1"}},
+
+            {{InferenceEngine::GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, InferenceEngine::PluginConfigParams::YES}},
+            {{InferenceEngine::GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, InferenceEngine::PluginConfigParams::NO}},
+            {{InferenceEngine::GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE, "0"}},
+            {{InferenceEngine::GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE, "1"}},
+            {{InferenceEngine::GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY, "0"}},
+            {{InferenceEngine::GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY, "1"}},
+            {{InferenceEngine::GPUConfigParams::KEY_GPU_MAX_NUM_THREADS, "1"}},
+            {{InferenceEngine::GPUConfigParams::KEY_GPU_MAX_NUM_THREADS, "4"}},
+            {{InferenceEngine::GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING, InferenceEngine::PluginConfigParams::YES}},
+            {{InferenceEngine::GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING, InferenceEngine::PluginConfigParams::NO}},
+    };
+    IE_SUPPRESS_DEPRECATED_END
+
     const std::vector> multiconf = {
             {{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES , CommonTestUtils::DEVICE_GPU}}
     };
@@ -92,6 +118,13 @@ namespace {
                     ::testing::ValuesIn(conf)),
             CorrectConfigAPITests::getTestCaseName);
 
+    INSTANTIATE_TEST_CASE_P(smoke_GPU_BehaviorTests, CorrectConfigAPITests,
+            ::testing::Combine(
+                    ::testing::ValuesIn(netPrecisions),
+                    ::testing::Values(CommonTestUtils::DEVICE_GPU),
+                    ::testing::ValuesIn(conf_gpu)),
+            CorrectConfigAPITests::getTestCaseName);
+
     INSTANTIATE_TEST_CASE_P(smoke_Multi_BehaviorTests, CorrectConfigAPITests,
             ::testing::Combine(
                     ::testing::ValuesIn(netPrecisions),
@@ -142,4 +175,4 @@ namespace {
             IncorrectConfigAPITests::getTestCaseName);
 
-}  // namespace
\ No newline at end of file
+}  // namespace
diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/core_integration.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/core_integration.cpp
index 3765c75864fa5f..68b23831e47944 100644
--- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/core_integration.cpp
+++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/core_integration.cpp
@@ -11,7 +11,7 @@
 #endif
 
 #include "gpu/gpu_context_api_ocl.hpp"
-#include "cldnn/cldnn_config.hpp"
+#include "gpu/gpu_config.hpp"
 
 using namespace BehaviorTestsDefinitions;
diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request_input.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request_input.cpp
index e15ea827caa814..59f4dd21677c24 100644
--- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request_input.cpp
+++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request_input.cpp
@@ -3,7 +3,7 @@
 //
 
 #include "behavior/infer_request_input.hpp"
-#include "cldnn/cldnn_config.hpp"
+#include "gpu/gpu_config.hpp"
 
 using namespace BehaviorTestsDefinitions;
 namespace {
diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request_output.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request_output.cpp
index 6c38f5c841c0b0..1135f6d9f7d8e4 100644
--- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request_output.cpp
+++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request_output.cpp
@@ -3,7 +3,7 @@
 //
 
 #include "behavior/infer_request_output.hpp"
-#include "cldnn/cldnn_config.hpp"
+#include "gpu/gpu_config.hpp"
 
 using namespace BehaviorTestsDefinitions;
 namespace {
diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/test_plugin.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/test_plugin.cpp
index 51979116646939..729bf57c64ac8a 100644
--- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/test_plugin.cpp
+++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/test_plugin.cpp
@@ -3,7 +3,7 @@
 //
 
 #include "behavior/test_plugin.hpp"
-#include "cldnn/cldnn_config.hpp"
+#include "gpu/gpu_config.hpp"
 
 using namespace BehaviorTestsDefinitions;
 namespace {
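The tests above wrap legacy-key usage in IE_SUPPRESS_DEPRECATED_START/END; user code that must keep setting a CLDNN_* key during migration can silence the warning the same way. A sketch under that assumption:

```cpp
#include <ie_core.hpp>
#include <cldnn/cldnn_config.hpp>  // still ships; it now pulls in gpu/gpu_config.hpp

int main() {
    using namespace InferenceEngine;
    Core ie;
    IE_SUPPRESS_DEPRECATED_START
    // Deliberate use of a deprecated key; the warning is silenced for this block only.
    ie.SetConfig({{ CLDNNConfigParams::KEY_CLDNN_PLUGIN_PRIORITY, "0" }}, "GPU");
    IE_SUPPRESS_DEPRECATED_END
    return 0;
}
```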
diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/multi/gpu_remote_blob_tests.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/multi/gpu_remote_blob_tests.cpp
index da308c032e258b..4fffb2cad6e157 100644
--- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/multi/gpu_remote_blob_tests.cpp
+++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/multi/gpu_remote_blob_tests.cpp
@@ -4,7 +4,7 @@
 
 #include 
 #include 
-#include "cldnn/cldnn_config.hpp"
+#include "gpu/gpu_config.hpp"
 #include "multi/multi_remote_blob_tests.hpp"
 #include "common_test_utils/test_constants.hpp"
 
diff --git a/inference-engine/tests/functional/plugin/gpu/single_layer_tests/tensor_iterator.cpp b/inference-engine/tests/functional/plugin/gpu/single_layer_tests/tensor_iterator.cpp
index aae2e0db8fa3ac..bce1ef106917f2 100644
--- a/inference-engine/tests/functional/plugin/gpu/single_layer_tests/tensor_iterator.cpp
+++ b/inference-engine/tests/functional/plugin/gpu/single_layer_tests/tensor_iterator.cpp
@@ -9,7 +9,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 #include "common_test_utils/test_constants.hpp"
 #include "ie_api.h"
@@ -289,8 +289,8 @@ namespace {
                                 InferenceEngine::Precision::FP16,
                         }), // precision
                         ::testing::ValuesIn(std::vector {
-                            {CommonTestUtils::DEVICE_GPU, {{CLDNNConfigParams::KEY_CLDNN_ENABLE_LOOP_UNROLLING, PluginConfigParams::YES}}},
-                            {CommonTestUtils::DEVICE_GPU, {{CLDNNConfigParams::KEY_CLDNN_ENABLE_LOOP_UNROLLING, PluginConfigParams::NO}}}
+                            {CommonTestUtils::DEVICE_GPU, {{GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING, PluginConfigParams::YES}}},
+                            {CommonTestUtils::DEVICE_GPU, {{GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING, PluginConfigParams::NO}}}
                        })), // configuration
                 TensorIteratorWithConfigTest::getTestCaseName);
 }  // namespace
diff --git a/tools/benchmark/main.py b/tools/benchmark/main.py
index 29aff45742e8c1..26ef6246f0c9d5 100644
--- a/tools/benchmark/main.py
+++ b/tools/benchmark/main.py
@@ -152,7 +152,7 @@ def set_throughput_streams():
             if MULTI_DEVICE_NAME in device_name and CPU_DEVICE_NAME in device_name:
                 logger.warning("Turn on GPU trottling. Multi-device execution with the CPU + GPU performs best with GPU trottling hint, " +
                                "which releases another CPU thread (that is otherwise used by the GPU driver for active polling)")
-                config[device]['CLDNN_PLUGIN_THROTTLE'] = '1'
+                config[device]['GPU_PLUGIN_THROTTLE'] = '1'
             elif device == MYRIAD_DEVICE_NAME:
                 set_throughput_streams()
                 config[device]['LOG_LEVEL'] = 'LOG_INFO'
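Finally, the benchmark changes in main.cpp and main.py above encode the same recommendation: in MULTI CPU+GPU runs, set the GPU throttle hint so the OpenCL queue does not monopolize a CPU thread. An equivalent application-side sketch (the model path, device list, and the multi-device header path are assumptions, not taken from this patch):

```cpp
#include <ie_core.hpp>
#include <gpu/gpu_config.hpp>
#include <multi-device/multi_device_config.hpp>

int main() {
    using namespace InferenceEngine;
    Core ie;
    CNNNetwork network = ie.ReadNetwork("model.xml");  // placeholder model path
    // Release the CPU thread that the GPU driver would otherwise use for active polling.
    ie.SetConfig({{ GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE, "1" }}, "GPU");
    ExecutableNetwork exec_net = ie.LoadNetwork(network, "MULTI",
        {{ MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, "GPU,CPU" }});
    return 0;
}
```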