diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/CMakeLists.txt b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/CMakeLists.txt index 62255b14..6d015b70 100644 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/CMakeLists.txt +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/CMakeLists.txt @@ -5,7 +5,7 @@ file(GLOB_RECURSE APPS_CPP_SRCS ${APPS_DIR}/*.cpp) idf_component_register( SRCS ${APPS_C_SRCS} ${APPS_CPP_SRCS} INCLUDE_DIRS ${APPS_DIR} - REQUIRES lvgl__lvgl esp_event esp_wifi nvs_flash esp_driver_jpeg esp_mm esp-brookesia bsp_extra esp32_p4_function_ev_board esp_video pedestrian_detect) + REQUIRES lvgl__lvgl esp_event esp_wifi nvs_flash esp_driver_jpeg esp_mm esp-brookesia bsp_extra esp32_p4_function_ev_board esp_video pedestrian_detect human_face_detect espressif__esp_lcd_touch_gt911) target_compile_options( ${COMPONENT_LIB} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/Camera.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/Camera.cpp index 64efe7d2..e89bd077 100644 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/Camera.cpp +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/Camera.cpp @@ -22,8 +22,11 @@ #include "bsp/esp-bsp.h" +#include "esp_lcd_touch_gt911.h" + #include "app_video.h" #include "app_pedestrian_detect.h" +#include "app_humanface_detect.h" #include "app_camera_pipeline.hpp" #include "Camera.hpp" #include "ui/ui.h" @@ -32,7 +35,7 @@ #define CAMERA_INIT_TASK_WAIT_MS (1000) #define DETECT_NUM_MAX (10) -#define FPS_PRINT (0) +#define FPS_PRINT (1) using namespace std; @@ -40,6 +43,7 @@ typedef enum { CAMERA_EVENT_TASK_RUN = BIT(0), CAMERA_EVENT_DELETE = BIT(1), CAMERA_EVENT_PED_DETECT = BIT(2), + CAMERA_EVENT_HUMAN_DETECT = BIT(3), } camera_event_id_t; LV_IMG_DECLARE(img_app_camera); @@ -48,10 +52,11 @@ static const char *TAG = "Camera"; // AI detection variables static void **detect_buf; -static int detect_num; -static int detect_bound[DETECT_NUM_MAX][4]; +static vector> detect_bound; +static vector> detect_keypoints; static std::list detect_results; static PedestrianDetect **ped_detect = NULL; +static HumanFaceDetect **hum_detect = NULL; static pipeline_handle_t feed_pipeline; static pipeline_handle_t detect_pipeline; @@ -100,6 +105,10 @@ bool Camera::run(void) ped_detect = get_pedestrian_detect(); *ped_detect = new PedestrianDetect(); assert(*ped_detect != NULL); + + hum_detect = get_humanface_detect(); + *hum_detect = new HumanFaceDetect(); + assert(*hum_detect != NULL); xTaskCreatePinnedToCore((TaskFunction_t)camera_dectect_task, "Camera Detect", 1024 * 8, this, 5, &_detect_task_handle, 1); @@ -163,12 +172,21 @@ bool Camera::run(void) if (xEventGroupGetBits(camera_event_group) & CAMERA_EVENT_PED_DETECT) { xEventGroupClearBits(camera_event_group, CAMERA_EVENT_PED_DETECT); + xEventGroupSetBits(camera_event_group, CAMERA_EVENT_HUMAN_DETECT); + lv_label_set_text(btn_label, " Face \n Detect"); + + lv_obj_add_flag(ui_ButtonCameraShotBtn, LV_OBJ_FLAG_HIDDEN); + lv_obj_add_flag(ui_PanelCameraShotControlBg, LV_OBJ_FLAG_HIDDEN); + lv_obj_add_flag(camera->_img_album, LV_OBJ_FLAG_HIDDEN); + camera->_screen_index = SCREEN_CAMERA_AI; + } else if (xEventGroupGetBits(camera_event_group) & CAMERA_EVENT_HUMAN_DETECT) { + xEventGroupClearBits(camera_event_group, CAMERA_EVENT_HUMAN_DETECT); lv_label_set_text(btn_label, " Normal \n Detect"); lv_obj_clear_flag(ui_ButtonCameraShotBtn, LV_OBJ_FLAG_HIDDEN); lv_obj_clear_flag(ui_PanelCameraShotControlBg, LV_OBJ_FLAG_HIDDEN); lv_obj_clear_flag(camera->_img_album, LV_OBJ_FLAG_HIDDEN); - camera->_screen_index = SCREEN_CAMERA_AI; + camera->_screen_index = SCREEN_CAMERA_SHOT; } else { xEventGroupSetBits(camera_event_group, CAMERA_EVENT_PED_DETECT); lv_label_set_text(btn_label, "Pedestrian \n Detect"); @@ -176,8 +194,9 @@ bool Camera::run(void) lv_obj_add_flag(ui_ButtonCameraShotBtn, LV_OBJ_FLAG_HIDDEN); lv_obj_add_flag(ui_PanelCameraShotControlBg, LV_OBJ_FLAG_HIDDEN); lv_obj_add_flag(camera->_img_album, LV_OBJ_FLAG_HIDDEN); - camera->_screen_index = SCREEN_CAMERA_SHOT; + camera->_screen_index = SCREEN_CAMERA_AI; } + }, LV_EVENT_CLICKED, this); return true; @@ -209,8 +228,14 @@ bool Camera::close(void) xEventGroupSetBits(camera_event_group, CAMERA_EVENT_TASK_RUN); xEventGroupSetBits(camera_event_group, CAMERA_EVENT_DELETE); xEventGroupClearBits(camera_event_group, CAMERA_EVENT_PED_DETECT); + xEventGroupClearBits(camera_event_group, CAMERA_EVENT_HUMAN_DETECT); app_video_stream_task_stop(_camera_ctlr_handle); + app_video_stream_wait_stop(); + + if (*hum_detect) { + delete *hum_detect; + } if (_img_album_buffer) { heap_caps_free(_img_album_buffer); @@ -226,11 +251,18 @@ bool Camera::init(void) xEventGroupClearBits(camera_event_group, CAMERA_EVENT_TASK_RUN); xEventGroupClearBits(camera_event_group, CAMERA_EVENT_DELETE); xEventGroupClearBits(camera_event_group, CAMERA_EVENT_PED_DETECT); + xEventGroupClearBits(camera_event_group, CAMERA_EVENT_HUMAN_DETECT); i2c_master_bus_handle_t i2c_bus_handle = bsp_i2c_get_handle(); esp_err_t ret = app_video_main(i2c_bus_handle); if (ret != ESP_OK) { ESP_LOGE(TAG, "video main init failed with error 0x%x", ret); + + if (ESP_OK == i2c_master_probe(i2c_bus_handle, ESP_LCD_TOUCH_IO_I2C_GT911_ADDRESS, 100) || ESP_OK == i2c_master_probe(i2c_bus_handle, ESP_LCD_TOUCH_IO_I2C_GT911_ADDRESS_BACKUP, 100)) { + ESP_LOGI(TAG, "gt911 touch found"); + } else { + ESP_LOGE(TAG, "Touch not found"); + } } // Open the video device @@ -418,6 +450,22 @@ static void camera_video_frame_operation(uint8_t *camera_buf, uint8_t camera_buf { xEventGroupWaitBits(camera_event_group, CAMERA_EVENT_TASK_RUN, pdFALSE, pdTRUE, portMAX_DELAY); + auto process_results = [&](const auto& results, bool process_keypoints) { + detect_keypoints.clear(); + detect_bound.clear(); + for (const auto& res : results) { + const auto& box = res.box; + if (box.size() >= 4 && std::any_of(box.begin(), box.end(), [](int v) { return v != 0; })) { + detect_bound.push_back(std::move(box)); + + if (process_keypoints && res.keypoint.size() >= 10 && + std::any_of(res.keypoint.begin(), res.keypoint.end(), [](int v) { return v != 0; })) { + detect_keypoints.push_back(std::move(res.keypoint)); + } + } + } + }; + if (xEventGroupGetBits(camera_event_group) & CAMERA_EVENT_PED_DETECT) { camera_pipeline_buffer_element *p = camera_pipeline_get_queued_element(feed_pipeline); if (p) { @@ -459,33 +507,40 @@ static void camera_video_frame_operation(uint8_t *camera_buf, uint8_t camera_buf camera_pipeline_buffer_element *detect_element = camera_pipeline_recv_element(detect_pipeline, 0); if (detect_element) { - detect_num = 0; - for(const auto &res : *(detect_element->detect_results)) { - if (res.box.size() >= 4) { - if (std::any_of(res.box.begin(), res.box.end(), [](int v){ return v != 0; })) { - detect_bound[detect_num][0] = res.box[0]; - detect_bound[detect_num][1] = res.box[1]; - detect_bound[detect_num][2] = res.box[2]; - detect_bound[detect_num][3] = res.box[3]; - detect_num++; - } - } - } + process_results(*(detect_element->detect_results), false); + camera_pipeline_queue_element_index(detect_pipeline, detect_element->index); } - for (int i = 0; i < detect_num; i++) { - if (std::any_of(detect_bound[i], detect_bound[i] + 4, [](int v){ return v != 0; })) { + for (int i = 0; i < detect_bound.size(); i++) { + if (detect_bound[i].size() >= 4 && std::any_of(detect_bound[i].begin(), detect_bound[i].end(), [](int v) { return v != 0; })) { draw_rectangle_rgb((uint16_t *)camera_buf, camera_buf_hes, camera_buf_ves, - detect_bound[i][0], detect_bound[i][1], detect_bound[i][2], detect_bound[i][3], - 0, 0, 255, 0, 0, 3); + detect_bound[i][0], detect_bound[i][1], detect_bound[i][2], detect_bound[i][3], + 0, 0, 255, 0, 0, 3); + } + } + } else if (xEventGroupGetBits(camera_event_group) & CAMERA_EVENT_HUMAN_DETECT) { + detect_results = app_humanface_detect((uint16_t *)camera_buf, camera_buf_ves, camera_buf_hes); + + process_results(detect_results, true); + + for (int i = 0; i < detect_keypoints.size(); i++) { + if (detect_bound[i].size() >= 4 && std::any_of(detect_bound[i].begin(), detect_bound[i].end(), [](int v) { return v != 0; })) { + draw_rectangle_rgb((uint16_t *)camera_buf, camera_buf_hes, camera_buf_ves, + detect_bound[i][0], detect_bound[i][1], detect_bound[i][2], detect_bound[i][3], + 0, 0, 255, 0, 0, 3); + + if (detect_keypoints[i].size() >= 10) { + draw_green_points((uint16_t *)camera_buf, detect_keypoints[i]); + } } } } - if (!(xEventGroupGetBits(camera_event_group) & CAMERA_EVENT_DELETE)) { - bsp_display_lock(0); - lv_canvas_set_buffer(ui_ImageCameraShotImage, camera_buf, camera_buf_hes, camera_buf_ves, LV_IMG_CF_TRUE_COLOR); + if (!(xEventGroupGetBits(camera_event_group) & CAMERA_EVENT_DELETE) && bsp_display_lock(100)) { + if(ui_ImageCameraShotImage) { + lv_canvas_set_buffer(ui_ImageCameraShotImage, camera_buf, camera_buf_hes, camera_buf_ves, LV_IMG_CF_TRUE_COLOR); + } lv_refr_now(NULL); bsp_display_unlock(); } diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/app_humanface_detect.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/app_humanface_detect.cpp new file mode 100644 index 00000000..cd147e3a --- /dev/null +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/app_humanface_detect.cpp @@ -0,0 +1,25 @@ +/* + * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "esp_log.h" +#include "iostream" +#include "human_face_detect.hpp" +#include "dl_tool.hpp" +#include "app_humanface_detect.h" + +static HumanFaceDetect *detect = NULL; + +std::list app_humanface_detect(uint16_t *frame, int width, int height) +{ + auto detect_results = detect->run(frame, {width, height, 3}); + + return detect_results; +} + +HumanFaceDetect **get_humanface_detect() +{ + return &detect; +} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/app_humanface_detect.h b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/app_humanface_detect.h new file mode 100644 index 00000000..806dad22 --- /dev/null +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/app_humanface_detect.h @@ -0,0 +1,20 @@ +/* + * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD + * + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include "human_face_detect.hpp" + +std::list app_humanface_detect(uint16_t *frame, int width, int height); + +#ifdef __cplusplus +extern "C" { +#endif + +HumanFaceDetect **get_humanface_detect(); + +#ifdef __cplusplus +} +#endif diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/app_video.c b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/app_video.c index 26bb1040..4403657c 100644 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/app_video.c +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/app_video.c @@ -25,6 +25,11 @@ static const char *TAG = "app_video"; #define VIDEO_TASK_STACK_SIZE (4 * 1024) #define VIDEO_TASK_PRIORITY (4) +typedef enum { + VIDEO_TASK_DELETE = BIT(0), + VIDEO_TASK_DELETE_DONE = BIT(1), +} video_event_id_t; + typedef struct { uint8_t *camera_buffer[MAX_BUFFER_COUNT]; size_t camera_buf_size; @@ -34,7 +39,7 @@ typedef struct { uint8_t camera_mem_mode; app_video_frame_operation_cb_t user_camera_video_frame_operation_cb; TaskHandle_t video_stream_task_handle; - bool video_task_delete; + EventGroupHandle_t video_event_group; } app_video_t; static app_video_t app_camera_video; @@ -330,6 +335,8 @@ static inline esp_err_t video_stream_stop(int video_fd) goto errout; } + xEventGroupSetBits(app_camera_video.video_event_group, VIDEO_TASK_DELETE_DONE); + return ESP_OK; errout: @@ -347,8 +354,8 @@ static void video_stream_task(void *arg) ESP_ERROR_CHECK(video_free_video_frame(video_fd)); - if(app_camera_video.video_task_delete) { - app_camera_video.video_task_delete = false; + if(xEventGroupGetBits(app_camera_video.video_event_group) & VIDEO_TASK_DELETE) { + xEventGroupClearBits(app_camera_video.video_event_group, VIDEO_TASK_DELETE); ESP_ERROR_CHECK(video_stream_stop(video_fd)); vTaskDelete(NULL); } @@ -358,6 +365,11 @@ static void video_stream_task(void *arg) esp_err_t app_video_stream_task_start(int video_fd, int core_id) { + if(app_camera_video.video_event_group == NULL) { + app_camera_video.video_event_group = xEventGroupCreate(); + } + xEventGroupClearBits(app_camera_video.video_event_group, VIDEO_TASK_DELETE_DONE); + video_stream_start(video_fd); BaseType_t result = xTaskCreatePinnedToCore(video_stream_task, "video stream task", VIDEO_TASK_STACK_SIZE, &video_fd, VIDEO_TASK_PRIORITY, &app_camera_video.video_stream_task_handle, core_id); @@ -376,7 +388,7 @@ esp_err_t app_video_stream_task_start(int video_fd, int core_id) esp_err_t app_video_stream_task_stop(int video_fd) { - app_camera_video.video_task_delete = true; + xEventGroupSetBits(app_camera_video.video_event_group, VIDEO_TASK_DELETE); return ESP_OK; } @@ -387,3 +399,12 @@ esp_err_t app_video_register_frame_operation_cb(app_video_frame_operation_cb_t o return ESP_OK; } + +esp_err_t app_video_stream_wait_stop(void) +{ + xEventGroupWaitBits(app_camera_video.video_event_group, VIDEO_TASK_DELETE_DONE, pdTRUE, pdTRUE, portMAX_DELAY); + + ESP_LOGI(TAG, "Video Stream Task Stopped Done"); + + return ESP_OK; +} \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/app_video.h b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/app_video.h index cc24ff4d..095650a9 100644 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/app_video.h +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/apps/camera/app_video.h @@ -9,6 +9,7 @@ #include "esp_err.h" #include "linux/videodev2.h" #include "esp_video_device.h" +#include "driver/i2c_master.h" #ifdef __cplusplus extern "C" { @@ -134,6 +135,15 @@ esp_err_t app_video_stream_task_stop(int video_fd); */ esp_err_t app_video_register_frame_operation_cb(app_video_frame_operation_cb_t operation_cb); +/** + * @brief Wait for the video stream to stop. + * + * Blocks the current task until the video stream task has stopped. + * + * @return ESP_OK on success. + */ +esp_err_t app_video_stream_wait_stop(void); + #ifdef __cplusplus } #endif diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/CMakeLists.txt b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/CMakeLists.txt deleted file mode 100644 index 2e60948e..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/CMakeLists.txt +++ /dev/null @@ -1,74 +0,0 @@ -idf_build_get_property(target IDF_TARGET) - -set(src_dirs ./dl/tool/src - ./dl/typedef/src - ./dl/base - ./dl/math/src - ./dl/model/src - ./dl/module/src - ./fbs_loader/src - ./vision/detect - ./vision/image - ./vision/recognition - ) - -set(include_dirs ./dl - ./dl/tool/include - ./dl/typedef/include - ./dl/base - ./dl/base/isa - ./dl/math/include - ./dl/model/include - ./dl/module/include - ./dl/lut/ - ./fbs_loader/include - ./vision/detect - ./vision/image - ./vision/recognition - ) - -if(CONFIG_IDF_TARGET_ESP32) - list(APPEND src_dirs dl/tool/isa/xtensa) - list(APPEND src_dirs dl/base/isa/xtensa) - -elseif(CONFIG_IDF_TARGET_ESP32S2) - -elseif(CONFIG_IDF_TARGET_ESP32S3) - list(APPEND src_dirs dl/tool/isa/xtensa) - list(APPEND src_dirs dl/tool/isa/tie728) - list(APPEND src_dirs dl/base/isa/xtensa) - list(APPEND src_dirs dl/base/isa/tie728) - -elseif(CONFIG_IDF_TARGET_ESP32C3) - -elseif(CONFIG_IDF_TARGET_ESP32P4) - list(APPEND src_dirs dl/tool/isa/esp32p4) - list(APPEND src_dirs dl/base/isa/esp32p4) -endif() - -set(requires esp_mm - esp_driver_ppa - esp_partition - esp_timer - mbedtls - spi_flash) - -idf_component_register(SRCS ${srcs} SRC_DIRS ${src_dirs} INCLUDE_DIRS ${include_dirs} REQUIRES ${requires}) - -if(CONFIG_IDF_TARGET_ESP32P4) - component_compile_options(-ffast-math -O3 -Wno-error=format=-Wno-format) - add_prebuilt_library(fbs_model "fbs_loader/lib/esp32p4/libfbs_model.a") - target_link_libraries(${COMPONENT_LIB} PRIVATE fbs_model) -elseif(CONFIG_IDF_TARGET_ESP32S3) - component_compile_options(-ffast-math -O3 -Wno-error=format=-Wno-format) - add_prebuilt_library(fbs_model "fbs_loader/lib/esp32s3/libfbs_model.a") - target_link_libraries(${COMPONENT_LIB} PRIVATE fbs_model) -else() - component_compile_options(-ffast-math -O2 -Wno-error=format=-Wno-format) -endif() - -# component_compile_options(-ffast-math -frtti -O2 -Wno-error=format=-Wno-format) -target_compile_options(${COMPONENT_LIB} PRIVATE -Wno-array-bounds - -Wno-deprecated-copy - -Wno-strict-aliasing - -Wno-overloaded-virtual) diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/README.md b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/README.md deleted file mode 100644 index 7aeafb5e..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# ESP-DL - -ESP-DL is designed to maintain optimal performance while significantly reducing the workload in model deployment. Our project has achieved the following key features: - -### ESP-DL Standard Model Format - -The ESP-DL standard model format is a binary format used to store the model graph, weights, and other essential information, with a file extension of `.espdl`. This format is similar to the ONNX model format but replaces ONNX's Protobuf with FlatBuffers, making our models more lightweight and supporting zero-copy deserialization. This feature ensures faster data access by eliminating the need to copy serialized data into separate memory areas. - -### [esp-ppq](https://github.com/espressif/esp-ppq) - -ESP-PPQ is a model quantization tool developed based on the open-source project PPQ. Users can select the ESP-DL target platform and directly export ESP-DL standard model files. ESP-PPQ inherits all the functionalities and documentation from the PPQ project, allowing users to conveniently choose quantization algorithms and analyze quantization errors. - -### Efficient Operator Implementation - -We have efficiently implemented common AI operators, including Conv2d, Pool2D, Gemm, Add, Mul, etc., based on AI instructions. These operators are precisely aligned with the PyTorch operator implementation, ensuring that the results obtained from the esp-ppq tool are consistent with those running on ESP-DL. - -### Static Memory Planner - -A new static memory planner is designed for the Internal RAM/PSRAM memory structure. Considering that internal RAM has faster access speed but limited capacity, we provide an API that allows users to customize the size of the internal RAM that the model can use. The memory planner will automatically allocate different layers to the optimal memory location based on the size of the internal RAM specified by the user, ensuring that the overall running speed is more efficient while occupying the minimum amount of memory. - -### Dual Core Scheduling - -The automatic dual-core scheduling enables computationally intensive operators to fully utilize the computing power of dual-cores. Currently, Conv2D and DepthwiseConv2D support dual-core scheduling. Below are some of our experimental results: - -| |conv2d(input=224X224X3, kernel=3x3, output=112x112x16)| -|:---:|:---:| -|single core| 12.1.ms| -|dual core| 6.2 ms| - ---- - -Explore ESP-DL to streamline your AI model deployment and achieve optimal performance with minimal resource usage. \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base.hpp deleted file mode 100644 index 9d76a62c..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base.hpp +++ /dev/null @@ -1,2713 +0,0 @@ -#pragma once - -#include -#include -#include - -#include "dl_constant.hpp" -#include "dl_variable.hpp" - -namespace dl { -namespace base { -template -struct ArgsType { - feature_t *input_element; /*> 1) - 1 */ - int c_rs2_1; /*> 2) - 1 */ - int n_div_x; /*> 1) - 1 */ - int16_t *filter_channel_factor; /* &); -typedef void (*n_wise_func_s16_t)(int16_t *, DL_S16_BUFFER_TYPE *, const ArgsType &); - -typedef void (*i_impl_func_s8_t)(int8_t *, int8_t *, void *); -typedef void (*c_impl_func_s8_t)(int32_t *, int8_t *, const ArgsType &); -typedef void (*n_wise_func_s8_t)(int8_t *, int32_t *, const ArgsType &); - -// TODO:剥离出多核时 input output 的指针分配 -template -void load_input_output_ptr() -{ -} - -template -std::vector> get_conv_operation_args(Tensor &output, - Tensor &input, - std::vector &padding, - const Filter &filter, - const int stride_y, - const int stride_x, - const Bias *bias = NULL, - const Activation *activate = NULL, - const bool auto_split = true, - const int core_number = 1, - bool malloc_debug_memory = false) -{ - ArgsType args; - args.input_element = input.get_element_ptr(); // TODO: auto_split - args.input_channel = input.shape[2]; - args.input_stride_y_offset = input.shape[1] * input.shape[2] * stride_y; - args.input_stride_x_offset = input.shape[2] * stride_x; - args.input_dilation_y_offset = input.shape[1] * input.shape[2] * filter.dilation[0]; - args.input_dilation_x_offset = input.shape[2] * filter.dilation[1]; - - args.output_element = output.get_element_ptr(); // TODO: auto_split - args.output_height = output.shape[0]; - args.output_width = output.shape[1]; - args.output_channel = output.shape[2]; - args.output_y_offset = output.shape[1] * output.shape[2]; - args.output_x_offset = output.shape[2]; - - args.filter_element = filter.element; // TODO: auto_split - args.filter_height = filter.shape[0]; - args.filter_width = filter.shape[1]; - args.filter_y_offset = 0; - args.filter_n_offset = 0; - - args.filter_y_offset_c = filter.shape[1] * filter.shape[2]; - args.filter_n_offset_c = args.filter_y_offset_c * filter.shape[0]; - - args.padding_h_head = padding[0]; - args.padding_h_tail = padding[1]; - args.padding_w_head = padding[2]; - args.padding_w_tail = padding[3]; - args.dilation_h = filter.dilation[0]; - args.dilation_w = filter.dilation[1]; - args.stride_x = stride_x; - args.stride_y = stride_y; - args.input_y_offset = input.shape[1] * input.shape[2]; - args.filter_c = filter.shape[2]; // dw: filter.shape[3]. conv: filter.shape[2]. - args.input_channel_with_padding = input.shape[2]; - args.input_height = input.shape[0]; - args.input_width = input.shape[1]; - args.auto_split = auto_split; - // printf("input: %d, %d, %d, output: %d, %d, %d\n", input.shape[0], input.shape[1], input.shape[2], - // output.shape[0], output.shape[1], output.shape[2]); - - if (filter.exponent == INT_MIN && sizeof(feature_t) == 1) { // S8 per-channel quantization - args.mac_shift = INT_MIN; - - // calculate scale using filter.channel_exponent - args.tie_filter_channel_factor = (int16_t *)tool::malloc_aligned( - filter.channel_exponent_size, sizeof(int16_t), 16, MALLOC_CAP_8BIT); // TODO: auto_split - int u = 16 / sizeof(feature_t); - int len = filter.channel_exponent_size / u * u; - - for (int i = 0; i < len; i++) { // special operation for qacc due to cannot shift different per-channel - int tmp = output.exponent - filter.channel_exponent[i] - input.exponent; - args.tie_filter_channel_factor[i] = (int16_t)1 << (15 - tmp); - } - for (int i = len; i < filter.channel_exponent_size; i++) { // for conv2d n remainder accx - args.tie_filter_channel_factor[i] = output.exponent - filter.channel_exponent[i] - input.exponent; - } - - args.filter_channel_factor = - (int16_t *)tool::malloc_aligned(filter.channel_exponent_size, sizeof(int16_t), 16, MALLOC_CAP_8BIT); - for (int i = 0; i < filter.channel_exponent_size; i++) { - args.filter_channel_factor[i] = output.exponent - filter.channel_exponent[i] - input.exponent; - } - } else { // per-layer quantization - args.mac_shift = output.exponent - filter.exponent - input.exponent; - } - - args.bias_element = bias ? bias->element : NULL; // TODO: auto_split - args.activation_type = activate ? activate->type : Linear; - - switch (args.activation_type) { - case ReLU: - args.activation_alpha = 0; - args.activation_shift = 0; - args.activation_alpha_ptr = NULL; - break; - case LeakyReLU: - args.activation_alpha = activate->element[0]; - args.activation_shift = -activate->exponent; - args.activation_alpha_ptr = NULL; - break; - case PReLU: - args.activation_alpha_ptr = activate->element; // TODO: auto_split - args.activation_shift = -activate->exponent; - break; - default: - args.activation_alpha_ptr = NULL; - args.activation_shift = -1; - break; - } - - // for ISA - args.c_rs1_1 = (input.shape[2] >> 1) - 1; - args.c_rs2_1 = (input.shape[2] >> 2) - 1; - int u = 16 / sizeof(feature_t); - args.n_div_x = output.shape[2] / u; // TODO: auto_split - args.c_div_x_1 = input.shape[2] / u - 1; - - args.c_remainder = args.input_channel % u * sizeof(feature_t); - args.n_remainder = args.output_channel % u; - - args.xtensa_dilation_x_offset = (filter.dilation[1] * input.shape[2] - input.shape[2]) * sizeof(feature_t); - args.xtensa_dilation_y_offset_stable = filter.dilation[0] * input.shape[2] * input.shape[1]; - args.xtensa_dilation_y_offset = (args.xtensa_dilation_y_offset_stable - input.shape[2] - - (filter.shape[1] - 1) * filter.dilation[1] * input.shape[2]) * - sizeof(feature_t); - - args.filter_y_offset_unaligned = 0; - args.filter_n_offset_unaligned = 0; - args.filter_element_unaligned = args.n_remainder - ? (filter.element + args.n_div_x * args.filter_height * args.filter_width * args.filter_c * u) - : filter.element; - - args.debug_value = nullptr; - if (malloc_debug_memory) { - args.debug_value = tool::calloc_aligned(16, sizeof(int8_t), 16, MALLOC_CAP_8BIT); - } - - // slice - std::vector> m_args(core_number, args); - if (core_number > 1) { - int output_y_slice = output.shape[0] / core_number; - int output_y_remained = output.shape[0]; - - // first slice - m_args[0].output_height = output_y_slice; - output_y_remained -= output_y_slice; - - // between slice - for (size_t i = 1; i < core_number - 1; i++) { - m_args[i].input_element = - m_args[i - 1].input_element + m_args[i - 1].output_height * args.input_stride_y_offset; - m_args[i].output_element = - m_args[i - 1].output_element + m_args[i - 1].output_height * args.output_y_offset; - m_args[i].output_height = output_y_slice; - output_y_remained -= output_y_slice; - } - - // last slice - m_args.back().input_element = - m_args[core_number - 2].input_element + m_args[core_number - 2].output_height * args.input_stride_y_offset; - m_args.back().output_element = - m_args[core_number - 2].output_element + m_args[core_number - 2].output_height * args.output_y_offset; - m_args.back().output_height = output_y_remained; - } - - return m_args; -} - -// Modifications: -// 1. Tensor, Filter, Bias, Activation -> TensorBase pointer -// 2. move dilations from Filter into function's argument -// 3. activation_alpha is used for PReLU and leaky ReLU -// TODO:: It is possible to remove the template for ArgsType -template -std::vector> get_conv_operation_args(TensorBase *output, - TensorBase *input, - std::vector &padding, - TensorBase *filter, - const int stride_y, - const int stride_x, - const int dilation_y, - const int dilation_x, - const int group, - TensorBase *bias = NULL, - const activation_type_t activate = Linear, - TensorBase *activation_alpha = nullptr, - const runtime_mode_t runtime_mode = RUNTIME_MODE_AUTO, - bool malloc_debug_memory = false) -{ - ArgsType args; - args.input_element = (feature_t *)input->get_element_ptr(); // TODO: auto_split - args.input_channel = input->shape[3]; - args.input_stride_y_offset = input->shape[2] * input->shape[3] * stride_y; - args.input_stride_x_offset = input->shape[3] * stride_x; - args.input_dilation_y_offset = input->shape[2] * input->shape[3] * dilation_y; - args.input_dilation_x_offset = input->shape[3] * dilation_x; - - args.output_element = (feature_t *)output->get_element_ptr(); // TODO: auto_split - args.output_height = output->shape[1]; - args.output_width = output->shape[2]; - args.output_channel = output->shape[3]; - args.output_y_offset = output->shape[2] * output->shape[3]; - args.output_x_offset = output->shape[3]; - - args.filter_element = filter->get_element_ptr(); // TODO: auto_split - args.filter_height = filter->shape[0]; - args.filter_width = filter->shape[1]; - if (group == 1) { - // conv - args.filter_y_offset = 0; - args.filter_c = filter->shape[2]; // dw: filter->shape[3]. conv: filter->shape[2]. - } else { - // depthwise - args.filter_y_offset = 16; - args.filter_c = filter->shape[3]; // dw: filter->shape[3]. conv: filter->shape[2]. - } - args.filter_n_offset = 0; - args.filter_y_offset_c = filter->shape[1] * filter->shape[2]; - args.filter_n_offset_c = args.filter_y_offset_c * filter->shape[0]; - - args.padding_h_head = padding[0]; - args.padding_h_tail = padding[1]; - args.padding_w_head = padding[2]; - args.padding_w_tail = padding[3]; - args.dilation_h = dilation_y; - args.dilation_w = dilation_x; - args.stride_x = stride_x; - args.stride_y = stride_y; - args.input_y_offset = input->shape[2] * input->shape[3]; - args.input_channel_with_padding = input->shape[3]; - args.input_height = input->shape[1]; - args.input_width = input->shape[2]; - args.auto_split = true; - // printf("input: %d, %d, %d, output: %d, %d, %d\n", input->shape[1], input->shape[2], input->shape[3], - // output->shape[1], output->shape[2], output->shape[3]); - - args.mac_shift = output->exponent - filter->exponent - input->exponent; - - args.bias_element = bias ? bias->get_element_ptr() : NULL; // TODO: auto_split - args.activation_type = activate; - - switch (args.activation_type) { - case ReLU: - args.activation_alpha = 0; - args.activation_shift = 0; - args.activation_alpha_ptr = NULL; - break; - case LeakyReLU: - // ESP_LOGE(__FUNCTION__, "Do not support Leaky ReLU"); - // args.activation_alpha = activation_alpha->get_element_ptr()[0]; - // args.activation_shift = -activation_alpha->exponent; - // args.activation_alpha_ptr = NULL; - break; - case PReLU: - // ESP_LOGE(__FUNCTION__, "Do not support PReLU"); - // args.activation_alpha_ptr = activation_alpha->get_element_ptr(); //TODO: auto_split - // args.activation_shift = -activation_alpha->exponent; - break; - default: - args.activation_alpha_ptr = NULL; - args.activation_shift = -1; - break; - } - - // for ISA - args.c_rs1_1 = (input->shape[3] >> 1) - 1; - args.c_rs2_1 = (input->shape[3] >> 2) - 1; - int u = 16 / sizeof(feature_t); - args.n_div_x = output->shape[3] / u; // TODO: auto_split - args.c_div_x_1 = input->shape[3] / u - 1; - - args.c_remainder = args.input_channel % u * sizeof(feature_t); - args.n_remainder = args.output_channel % u; - - args.xtensa_dilation_x_offset = (dilation_x * input->shape[3] - input->shape[3]) * sizeof(feature_t); - args.xtensa_dilation_y_offset_stable = dilation_y * input->shape[3] * input->shape[2]; - args.xtensa_dilation_y_offset = (args.xtensa_dilation_y_offset_stable - input->shape[3] - - (filter->shape[1] - 1) * dilation_x * input->shape[3]) * - sizeof(feature_t); - - args.filter_y_offset_unaligned = 0; - args.filter_n_offset_unaligned = 0; - args.filter_element_unaligned = args.n_remainder - ? ((feature_t *)args.filter_element + args.n_div_x * args.filter_height * args.filter_width * args.filter_c * u) - : args.filter_element; - - if (group > 1) { - args.filter_w_rs1_1 = (filter->shape[1] >> 1) - 1; - args.tie_depth2d_dilation_x_offset = dilation_x * input->shape[3] * sizeof(feature_t); - args.tie_depth2d_dilation_y_offset_stable = dilation_y * input->shape[3] * input->shape[2]; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - (filter->shape[1] - 1) * dilation_x * input->shape[3]) * - sizeof(feature_t); - - args.tie_depth2d_next_hwx1 = - (filter->shape[1] - 1) * dilation_x + (filter->shape[0] - 1) * dilation_y * input->shape[2]; - args.tie_depth2d_next_hwx1 = 16 - args.tie_depth2d_next_hwx1 * input->shape[3] * sizeof(feature_t); - } - args.debug_value = nullptr; - if (malloc_debug_memory) { - args.debug_value = tool::calloc_aligned(16, sizeof(int8_t), 16, MALLOC_CAP_8BIT); - } - std::vector> m_args(1, args); - if (args.input_height > 4 * args.dilation_h * args.filter_height) { - if (runtime_mode == RUNTIME_MODE_MULTI_CORE || - (runtime_mode == RUNTIME_MODE_AUTO && args.input_height >= 100 && args.input_width >= 50)) { - m_args.push_back(args); - - // Divide this convolution into two tasks by splitting the input height. - // up - int dilation_filter_height = args.dilation_h * (args.filter_height - 1) + 1; - int half_step = (args.padding_h_head + args.padding_h_tail + args.input_height - dilation_filter_height) / - args.stride_y / 2; - m_args[0].input_height = dilation_filter_height - args.padding_h_head + half_step * args.stride_y; - m_args[0].padding_h_tail = 0; - m_args[0].output_height = half_step + 1; - // bottom - m_args[1].padding_h_head = 0; - m_args[1].input_height = - dilation_filter_height - args.stride_y + args.input_height - m_args[0].input_height; - m_args[1].input_element += - (args.input_height - m_args[1].input_height) * args.input_width * args.input_channel; - m_args[1].output_height = args.output_height - m_args[0].output_height; - m_args[1].output_element += - (args.output_height - m_args[1].output_height) * args.output_width * args.output_channel; - } - } - - return m_args; -} - -template -void conv_operation_shell(ArgsType &args, - void (*i_impl_func)(feature_t *, feature_t *, void *), - void (*i_impl_func_sp)(feature_t *, feature_t *, void *), - void (*c_impl_func)(buffer_t *, feature_t *, const ArgsType &), - void (*c_impl_func_sp)(buffer_t *, feature_t *, const ArgsType &), - void (*n_wise_tail)(feature_t *, buffer_t *, const ArgsType &)) -{ - feature_t *input_ptr = (feature_t *)args.input_element; - feature_t *output_ptr = (feature_t *)args.output_element; - if (args.padding_h_head || args.padding_w_head || args.padding_h_tail || args.padding_w_tail) { // padding same - int n_h_head = (args.padding_h_head + args.stride_y - 1) / args.stride_y; - int n_w_head = (args.padding_w_head + args.stride_x - 1) / args.stride_x; - int n_h_body = ((args.input_height + args.padding_h_head - args.dilation_h * (args.filter_height - 1) - 1) / - args.stride_y + - 1) - - n_h_head; - int n_w_body = - ((args.input_width + args.padding_w_head - args.dilation_w * (args.filter_width - 1) - 1) / args.stride_x + - 1) - - n_w_head; - int n_h_tail = args.output_height - n_h_head - n_h_body; - int n_w_tail = args.output_width - n_w_head - n_w_body; - int filter_h = args.filter_height; - int filter_w = args.filter_width; - feature_t *filter_ptr = (feature_t *)(args.filter_element); - - if (i_impl_func_sp) { - feature_t *input_y_real; - feature_t *input_x_real; - feature_t *filter_ptr_y; - feature_t *output_yx = output_ptr; - feature_t *filter_ptr_unaligned = (feature_t *)(args.filter_element_unaligned); - feature_t *filter_ptr_y_unaligned; - int unaligned_filter_c_n_offset = args.filter_c * sizeof(feature_t); -#if CONFIG_TIE728_BOOST || CONFIG_ESP32P4_BOOST - int filter_c_n_offset = args.n_div_x ? args.filter_c * 16 : unaligned_filter_c_n_offset; -#else - int filter_c_n_offset = unaligned_filter_c_n_offset; -#endif - int filter_c_n_ptr_offset = filter_c_n_offset / sizeof(feature_t); - - if (n_wise_tail) { - for (size_t output_y = 0; output_y < n_h_head; output_y++) { - args.filter_height = filter_h - - ((args.padding_h_head - output_y * args.stride_y) + args.dilation_h - 1) / args.dilation_h; - input_y_real = input_ptr + - args.input_y_offset * - ((args.stride_y * output_y + (filter_h - args.filter_height) * args.dilation_h) - - args.padding_h_head); - filter_ptr_y = filter_ptr + (filter_h - args.filter_height) * filter_w * filter_c_n_ptr_offset; - filter_ptr_y_unaligned = - filter_ptr_unaligned + (filter_h - args.filter_height) * filter_w * args.filter_c; - args.filter_n_offset = (filter_w * (filter_h - args.filter_height)) * filter_c_n_offset; - args.filter_n_offset_unaligned = - (filter_w * (filter_h - args.filter_height)) * unaligned_filter_c_n_offset; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - args.filter_element_unaligned = - filter_ptr_y_unaligned + (filter_w - args.filter_width) * args.filter_c; - - i_impl_func(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.filter_y_offset = 0; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = 0; - args.filter_element = filter_ptr_y; - args.filter_element_unaligned = filter_ptr_y_unaligned; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - i_impl_func(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - } - - args.filter_height = filter_h; - input_y_real = input_ptr + args.input_y_offset * ((args.stride_y * n_h_head) - args.padding_h_head); - filter_ptr_y = filter_ptr; - filter_ptr_y_unaligned = filter_ptr_unaligned; - args.filter_n_offset = 0; - args.filter_n_offset_unaligned = 0; - for (size_t output_y = 0; output_y < n_h_body; output_y++) { - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - args.filter_element_unaligned = - filter_ptr_y_unaligned + (filter_w - args.filter_width) * args.filter_c; - i_impl_func(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.filter_y_offset = 0; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = 0; - args.filter_element = filter_ptr_y; - args.filter_element_unaligned = filter_ptr_y_unaligned; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func_sp(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - i_impl_func(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - input_y_real += args.input_stride_y_offset; - } - - for (size_t output_y = 0; output_y < n_h_tail; output_y++) { - args.filter_height = (args.padding_h_head + args.input_height - - (n_h_head + n_h_body + output_y) * args.stride_y + args.dilation_h - 1) / - args.dilation_h; - args.filter_n_offset = (filter_w * (filter_h - args.filter_height)) * filter_c_n_offset; - args.filter_n_offset_unaligned = - (filter_w * (filter_h - args.filter_height)) * unaligned_filter_c_n_offset; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - args.filter_element_unaligned = - filter_ptr_y_unaligned + (filter_w - args.filter_width) * args.filter_c; - i_impl_func(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.filter_y_offset = 0; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = 0; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.filter_element = filter_ptr_y; - args.filter_element_unaligned = filter_ptr_y_unaligned; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - i_impl_func(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - input_y_real += args.input_stride_y_offset; - } - } else { - for (size_t output_y = 0; output_y < n_h_head; output_y++) { - args.filter_height = filter_h - - ((args.padding_h_head - output_y * args.stride_y) + args.dilation_h - 1) / args.dilation_h; - input_y_real = input_ptr + - args.input_y_offset * - ((args.stride_y * output_y + (filter_h - args.filter_height) * args.dilation_h) - - args.padding_h_head); - filter_ptr_y = filter_ptr + (filter_h - args.filter_height) * filter_w * filter_c_n_ptr_offset; - filter_ptr_y_unaligned = - filter_ptr_unaligned + (filter_h - args.filter_height) * filter_w * args.filter_c; - args.filter_n_offset = (filter_w * (filter_h - args.filter_height)) * filter_c_n_offset; - args.filter_n_offset_unaligned = - (filter_w * (filter_h - args.filter_height)) * unaligned_filter_c_n_offset; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - args.filter_element_unaligned = - filter_ptr_y_unaligned + (filter_w - args.filter_width) * args.filter_c; - - i_impl_func(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.filter_y_offset = 0; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = 0; - args.filter_element = filter_ptr_y; - args.filter_element_unaligned = filter_ptr_y_unaligned; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - i_impl_func(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - } - - args.filter_height = filter_h; - input_y_real = input_ptr + args.input_y_offset * ((args.stride_y * n_h_head) - args.padding_h_head); - filter_ptr_y = filter_ptr; - filter_ptr_y_unaligned = filter_ptr_unaligned; - args.filter_n_offset = 0; - args.filter_n_offset_unaligned = 0; - - for (size_t output_y = 0; output_y < n_h_body; output_y++) { - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - args.filter_element_unaligned = - filter_ptr_y_unaligned + (filter_w - args.filter_width) * args.filter_c; - i_impl_func(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.filter_y_offset = 0; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = 0; - args.filter_element = filter_ptr_y; - args.filter_element_unaligned = filter_ptr_y_unaligned; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func_sp(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - i_impl_func(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - input_y_real += args.input_stride_y_offset; - } - - for (size_t output_y = 0; output_y < n_h_tail; output_y++) { - args.filter_height = (args.padding_h_head + args.input_height - - (n_h_head + n_h_body + output_y) * args.stride_y + args.dilation_h - 1) / - args.dilation_h; - args.filter_n_offset = (filter_w * (filter_h - args.filter_height)) * filter_c_n_offset; - args.filter_n_offset_unaligned = - (filter_w * (filter_h - args.filter_height)) * unaligned_filter_c_n_offset; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - args.filter_element_unaligned = - filter_ptr_y_unaligned + (filter_w - args.filter_width) * args.filter_c; - i_impl_func(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.filter_y_offset = 0; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = 0; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.filter_element = filter_ptr_y; - args.filter_element_unaligned = filter_ptr_y_unaligned; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.xtensa_dilation_y_offset = - (args.xtensa_dilation_y_offset_stable - args.input_channel - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - i_impl_func(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - input_y_real += args.input_stride_y_offset; - } - } - } else // run c_impl_func - { - buffer_t *buffer = - (buffer_t *)tool::calloc_aligned(args.output_channel, sizeof(buffer_t), 16, MALLOC_CAP_8BIT); - feature_t *input_y_real; - feature_t *input_x_real; - feature_t *filter_ptr_y; - feature_t *output_yx = output_ptr; - int filter_c_n_offset = args.output_channel; - int filter_c_n_ptr_offset = filter_c_n_offset; - - for (size_t output_y = 0; output_y < n_h_head; output_y++) { - args.filter_height = filter_h - - ((args.padding_h_head - output_y * args.stride_y) + args.dilation_h - 1) / args.dilation_h; - input_y_real = input_ptr + - args.input_y_offset * - ((args.stride_y * output_y + (filter_h - args.filter_height) * args.dilation_h) - - args.padding_h_head); - filter_ptr_y = filter_ptr + (filter_h - args.filter_height) * filter_w * filter_c_n_ptr_offset; - args.filter_n_offset = (filter_w * (filter_h - args.filter_height)) * filter_c_n_offset; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - c_impl_func(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.filter_y_offset = 0; // ??? c, xtensa, tie 顺序不同 - args.filter_element = filter_ptr_y; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - c_impl_func(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - c_impl_func(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - } - - args.filter_height = filter_h; - input_y_real = input_ptr + args.input_y_offset * ((args.stride_y * n_h_head) - args.padding_h_head); - filter_ptr_y = filter_ptr; - args.filter_n_offset = 0; - - for (size_t output_y = 0; output_y < n_h_body; output_y++) { - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - c_impl_func(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.filter_y_offset = 0; // ??? c, xtensa, tie 顺序不同 - args.filter_element = filter_ptr_y; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - c_impl_func_sp(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - c_impl_func(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - input_y_real += args.input_stride_y_offset; - } - - for (size_t output_y = 0; output_y < n_h_tail; output_y++) { - args.filter_height = (args.padding_h_head + args.input_height - - (n_h_head + n_h_body + output_y) * args.stride_y + args.dilation_h - 1) / - args.dilation_h; - args.filter_n_offset = (filter_w * (filter_h - args.filter_height)) * filter_c_n_offset; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - c_impl_func(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.filter_y_offset = 0; // ??? c, xtensa, tie 顺序不同 - args.filter_element = filter_ptr_y; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - c_impl_func(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - c_impl_func(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - input_y_real += args.input_stride_y_offset; - } - tool::free_aligned(buffer); - } - } else { // padding valid - if (i_impl_func_sp) { - if (n_wise_tail) { - for (size_t output_y = 0; output_y < args.output_height; output_y++) { - feature_t *input_syx = input_ptr; - feature_t *output_yx = output_ptr; - - for (size_t output_x = 0; output_x < args.output_width; output_x++) { - i_impl_func_sp(output_yx, input_syx, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - - input_syx += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - input_ptr += args.input_stride_y_offset; - output_ptr += args.output_y_offset; - } - } else { - for (size_t output_y = 0; output_y < args.output_height; output_y++) { - feature_t *input_syx = input_ptr; - feature_t *output_yx = output_ptr; - - for (size_t output_x = 0; output_x < args.output_width; output_x++) { - i_impl_func_sp(output_yx, input_syx, (void *const)&args); - input_syx += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - input_ptr += args.input_stride_y_offset; - output_ptr += args.output_y_offset; - } - } - } else // run c_impl_func - { - buffer_t *buffer = - (buffer_t *)tool::calloc_aligned(args.output_channel, sizeof(buffer_t), 16, MALLOC_CAP_8BIT); - for (size_t output_y = 0; output_y < args.output_height; output_y++) { - feature_t *input_syx = input_ptr; - feature_t *output_yx = output_ptr; - - for (size_t output_x = 0; output_x < args.output_width; output_x++) { - c_impl_func_sp(buffer, input_syx, args); - n_wise_tail(output_yx, buffer, args); - - input_syx += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - input_ptr += args.input_stride_y_offset; - output_ptr += args.output_y_offset; - } - tool::free_aligned(buffer); - } - } - - if (args.mac_shift == INT_MIN) { - tool::free_aligned(args.tie_filter_channel_factor); - tool::free_aligned(args.filter_channel_factor); - } - - if (args.debug_value) { - tool::free_aligned(args.debug_value); - args.debug_value = nullptr; - } - - return; -} - -template -std::vector> get_dwconv_operation_args(Tensor &output, - Tensor &input, - std::vector &padding, - const Filter &filter, - const int stride_y, - const int stride_x, - const Bias *bias = NULL, - const Activation *activate = NULL, - const int core_number = 1, - bool malloc_debug_memory = false) -{ - ArgsType args; - args.input_element = input.get_element_ptr(); - args.input_channel = input.shape[2]; - args.input_stride_y_offset = input.shape[1] * input.shape[2] * stride_y; - args.input_stride_x_offset = input.shape[2] * stride_x; - args.input_dilation_y_offset = input.shape[1] * input.shape[2] * filter.dilation[0]; - args.input_dilation_x_offset = input.shape[2] * filter.dilation[1]; - - args.output_element = output.get_element_ptr(); - args.output_height = output.shape[0]; - args.output_width = output.shape[1]; - args.output_channel = output.shape[2]; - args.output_y_offset = output.shape[1] * output.shape[2]; - args.output_x_offset = output.shape[2]; - - args.filter_element = filter.element; - args.filter_height = filter.shape[0]; - args.filter_width = filter.shape[1]; - args.filter_y_offset = 16; - args.filter_n_offset = 0; - - args.filter_y_offset_c = filter.shape[1] * filter.shape[2]; - args.filter_n_offset_c = args.filter_y_offset_c * filter.shape[0]; - - args.padding_h_head = padding[0]; - args.padding_h_tail = padding[1]; - args.padding_w_head = padding[2]; - args.padding_w_tail = padding[3]; - args.dilation_h = filter.dilation[0]; - args.dilation_w = filter.dilation[1]; - args.stride_x = stride_x; - args.stride_y = stride_y; - args.input_y_offset = input.shape[1] * input.shape[2]; - args.filter_c = filter.shape[3]; // dw: filter.shape[3]. conv: filter.shape[2]. - // args.filter_n = filter.shape[3]; - args.input_channel_with_padding = input.shape[2]; - - if (filter.exponent == INT_MIN && sizeof(feature_t) == 1) { // S8 per-channel quantization - args.mac_shift = INT_MIN; - - // calculate scale using filter.channel_exponent - args.tie_filter_channel_factor = - (int16_t *)tool::malloc_aligned(filter.channel_exponent_size, sizeof(int16_t), 16, MALLOC_CAP_8BIT); - int u = 16 / sizeof(feature_t); - // int len = filter.channel_exponent_size / u * u; - // depthwise_conv2d - for (int i = 0; i < filter.channel_exponent_size; i++) { - int tmp = output.exponent - filter.channel_exponent[i] - input.exponent; - args.tie_filter_channel_factor[i] = (int16_t)1 << (15 - tmp); - } - - args.filter_channel_factor = - (int16_t *)tool::malloc_aligned(filter.channel_exponent_size, sizeof(int16_t), 16, MALLOC_CAP_8BIT); - for (int i = 0; i < filter.channel_exponent_size; i++) { - args.filter_channel_factor[i] = output.exponent - filter.channel_exponent[i] - input.exponent; - } - } else { // per-layer quantization - args.mac_shift = output.exponent - filter.exponent - input.exponent; - } - - args.bias_element = bias ? bias->element : NULL; - args.activation_type = activate ? activate->type : Linear; - - switch (args.activation_type) { - case ReLU: - args.activation_alpha = 0; - args.activation_shift = 0; - args.activation_alpha_ptr = NULL; - break; - case LeakyReLU: - args.activation_alpha = activate->element[0]; - args.activation_shift = -activate->exponent; - args.activation_alpha_ptr = NULL; - break; - case PReLU: - args.activation_alpha_ptr = activate->element; - args.activation_shift = -activate->exponent; - break; - default: - args.activation_alpha_ptr = NULL; - args.activation_shift = -1; - break; - } - - // for ISA - args.c_rs1_1 = (input.shape[2] >> 1) - 1; - args.c_rs2_1 = (input.shape[2] >> 2) - 1; - int u = 16 / sizeof(feature_t); - args.n_div_x = output.shape[2] / u; - args.c_div_x_1 = input.shape[2] / u - 1; - - args.c_remainder = (args.input_channel % u) * sizeof(feature_t); - args.n_remainder = args.output_channel % u; - args.filter_w_rs1_1 = (filter.shape[1] >> 1) - 1; - - args.xtensa_dilation_x_offset = (filter.dilation[1] * input.shape[2] - input.shape[2]) * sizeof(feature_t); - args.xtensa_dilation_y_offset_stable = filter.dilation[0] * input.shape[2] * input.shape[1]; - args.xtensa_dilation_y_offset = (args.xtensa_dilation_y_offset_stable - input.shape[2] - - (filter.shape[1] - 1) * filter.dilation[1] * input.shape[2]) * - sizeof(feature_t); - - args.input_height = input.shape[0]; - args.input_width = input.shape[1]; - args.tie_depth2d_dilation_x_offset = filter.dilation[1] * input.shape[2] * sizeof(feature_t); - args.tie_depth2d_dilation_y_offset_stable = filter.dilation[0] * input.shape[2] * input.shape[1]; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - (filter.shape[1] - 1) * filter.dilation[1] * input.shape[2]) * - sizeof(feature_t); - - args.tie_depth2d_next_hwx1 = - (filter.shape[1] - 1) * filter.dilation[1] + (filter.shape[0] - 1) * filter.dilation[0] * input.shape[1]; - args.tie_depth2d_next_hwx1 = 16 - args.tie_depth2d_next_hwx1 * input.shape[2] * sizeof(feature_t); - - args.filter_y_offset_unaligned = 0; - args.filter_n_offset_unaligned = 0; - args.filter_element_unaligned = args.n_remainder - ? (filter.element + args.n_div_x * args.filter_height * args.filter_width * args.filter_c * u) - : filter.element; - - args.debug_value = nullptr; - if (malloc_debug_memory) { - args.debug_value = tool::calloc_aligned(16, sizeof(int8_t), 16, MALLOC_CAP_8BIT); - } - - // slice - std::vector> m_args(core_number, args); - if (core_number > 1) { - int output_y_slice = output.shape[0] / core_number; - int output_y_remained = output.shape[0]; - - // first slice - m_args[0].output_height = output_y_slice; - output_y_remained -= output_y_slice; - - // between slice - for (size_t i = 1; i < core_number - 1; i++) { - m_args[i].input_element = - m_args[i - 1].input_element + m_args[i - 1].output_height * args.input_stride_y_offset; - m_args[i].output_element = - m_args[i - 1].output_element + m_args[i - 1].output_height * args.output_y_offset; - m_args[i].output_height = output_y_slice; - output_y_remained -= output_y_slice; - } - - // last slice - m_args.back().input_element = - m_args[core_number - 2].input_element + m_args[core_number - 2].output_height * args.input_stride_y_offset; - m_args.back().output_element = - m_args[core_number - 2].output_element + m_args[core_number - 2].output_height * args.output_y_offset; - m_args.back().output_height = output_y_remained; - } - - return m_args; -} - -template -void dwconv_operation_shell(ArgsType &args, - void (*i_impl_func)(feature_t *, feature_t *, void *), - void (*i_impl_func_sp)(feature_t *, feature_t *, void *), - void (*c_impl_func)(buffer_t *, feature_t *, const ArgsType &), - void (*c_impl_func_sp)(buffer_t *, feature_t *, const ArgsType &), - void (*n_wise_tail)(feature_t *, buffer_t *, const ArgsType &)) -{ - feature_t *input_ptr = (feature_t *)args.input_element; - feature_t *output_ptr = (feature_t *)args.output_element; - if (args.padding_h_head || args.padding_w_head || args.padding_h_tail || args.padding_w_tail) { // padding same - int n_h_head = (args.padding_h_head + args.stride_y - 1) / args.stride_y; - int n_w_head = (args.padding_w_head + args.stride_x - 1) / args.stride_x; - int n_h_body = ((args.input_height + args.padding_h_head - args.dilation_h * (args.filter_height - 1) - 1) / - args.stride_y + - 1) - - n_h_head; - int n_w_body = - ((args.input_width + args.padding_w_head - args.dilation_w * (args.filter_width - 1) - 1) / args.stride_x + - 1) - - n_w_head; - int n_h_tail = args.output_height - n_h_head - n_h_body; - int n_w_tail = args.output_width - n_w_head - n_w_body; - int filter_h = args.filter_height; - int filter_w = args.filter_width; - feature_t *filter_ptr = (feature_t *)(args.filter_element); - - if (i_impl_func_sp) { - feature_t *input_y_real; - feature_t *input_x_real; - feature_t *filter_ptr_y; - feature_t *output_yx = output_ptr; - feature_t *filter_ptr_unaligned = (feature_t *)(args.filter_element_unaligned); - feature_t *filter_ptr_y_unaligned; - int unaligned_filter_c_n_offset = args.c_remainder; - int c_remainder_num = args.c_remainder / sizeof(feature_t); - int filter_c_n_offset = args.n_div_x ? args.filter_c * 16 : unaligned_filter_c_n_offset; - int filter_c_n_ptr_offset = filter_c_n_offset / sizeof(feature_t); - - if (n_wise_tail) { - for (size_t output_y = 0; output_y < n_h_head; output_y++) { - args.filter_height = filter_h - - ((args.padding_h_head - output_y * args.stride_y) + args.dilation_h - 1) / args.dilation_h; - input_y_real = input_ptr + - args.input_y_offset * - ((args.stride_y * output_y + (filter_h - args.filter_height) * args.dilation_h) - - args.padding_h_head); - filter_ptr_y = filter_ptr + (filter_h - args.filter_height) * filter_w * filter_c_n_ptr_offset; - filter_ptr_y_unaligned = - filter_ptr_unaligned + (filter_h - args.filter_height) * filter_w * c_remainder_num; - args.filter_n_offset = (filter_w * (filter_h - args.filter_height)) * filter_c_n_offset; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width + 1) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - args.filter_element_unaligned = - filter_ptr_y_unaligned + (filter_w - args.filter_width) * c_remainder_num; - - i_impl_func(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - args.filter_y_offset = filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = 0; - args.filter_element = filter_ptr_y; - args.filter_element_unaligned = filter_ptr_y_unaligned; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - args.filter_y_offset = - (filter_w - args.filter_width + 1) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - i_impl_func(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - } - - // h_body - args.filter_height = filter_h; - input_y_real = input_ptr + args.input_y_offset * ((args.stride_y * n_h_head) - args.padding_h_head); - filter_ptr_y = filter_ptr; - filter_ptr_y_unaligned = filter_ptr_unaligned; - args.filter_n_offset = 0; - for (size_t output_y = 0; output_y < n_h_body; output_y++) { - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width + 1) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - args.filter_element_unaligned = - filter_ptr_y_unaligned + (filter_w - args.filter_width) * c_remainder_num; - i_impl_func(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - args.filter_y_offset = filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = 0; - args.filter_element = filter_ptr_y; - args.filter_element_unaligned = filter_ptr_y_unaligned; - - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func_sp(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - args.filter_y_offset = - (filter_w - args.filter_width + 1) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - i_impl_func(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - input_y_real += args.input_stride_y_offset; - } - - // h_tail - for (size_t output_y = 0; output_y < n_h_tail; output_y++) { - args.filter_height = (args.padding_h_head + args.input_height - - (n_h_head + n_h_body + output_y) * args.stride_y + args.dilation_h - 1) / - args.dilation_h; - args.filter_n_offset = (filter_w * (filter_h - args.filter_height)) * filter_c_n_offset; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width + 1) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - args.filter_element_unaligned = - filter_ptr_y_unaligned + (filter_w - args.filter_width) * c_remainder_num; - i_impl_func(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - args.filter_y_offset = filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = 0; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - args.filter_element = filter_ptr_y; - args.filter_element_unaligned = filter_ptr_y_unaligned; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - args.filter_y_offset = - (filter_w - args.filter_width + 1) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - i_impl_func(output_yx, input_x_real, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - input_y_real += args.input_stride_y_offset; - } - } else // without n_wise_tail - { - // h_head - for (size_t output_y = 0; output_y < n_h_head; output_y++) { - args.filter_height = filter_h - - ((args.padding_h_head - output_y * args.stride_y) + args.dilation_h - 1) / args.dilation_h; - input_y_real = input_ptr + - args.input_y_offset * - ((args.stride_y * output_y + (filter_h - args.filter_height) * args.dilation_h) - - args.padding_h_head); - filter_ptr_y = filter_ptr + (filter_h - args.filter_height) * filter_w * filter_c_n_ptr_offset; - filter_ptr_y_unaligned = - filter_ptr_unaligned + (filter_h - args.filter_height) * filter_w * c_remainder_num; - args.filter_n_offset = (filter_w * (filter_h - args.filter_height)) * filter_c_n_offset; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width + 1) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - args.filter_element_unaligned = - filter_ptr_y_unaligned + (filter_w - args.filter_width) * c_remainder_num; - - i_impl_func(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - args.filter_y_offset = filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = 0; - args.filter_element = filter_ptr_y; - args.filter_element_unaligned = filter_ptr_y_unaligned; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - args.filter_y_offset = - (filter_w - args.filter_width + 1) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - i_impl_func(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - } - - // h_body - args.filter_height = filter_h; - input_y_real = input_ptr + args.input_y_offset * ((args.stride_y * n_h_head) - args.padding_h_head); - filter_ptr_y = filter_ptr; - filter_ptr_y_unaligned = filter_ptr_unaligned; - args.filter_n_offset = 0; - for (size_t output_y = 0; output_y < n_h_body; output_y++) { - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width + 1) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - args.filter_element_unaligned = - filter_ptr_y_unaligned + (filter_w - args.filter_width) * c_remainder_num; - i_impl_func(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - args.filter_y_offset = filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = 0; - args.filter_element = filter_ptr_y; - args.filter_element_unaligned = filter_ptr_y_unaligned; - - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func_sp(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - args.filter_y_offset = - (filter_w - args.filter_width + 1) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - i_impl_func(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - input_y_real += args.input_stride_y_offset; - } - - // h_tail - for (size_t output_y = 0; output_y < n_h_tail; output_y++) { - args.filter_height = (args.padding_h_head + args.input_height - - (n_h_head + n_h_body + output_y) * args.stride_y + args.dilation_h - 1) / - args.dilation_h; - args.filter_n_offset = (filter_w * (filter_h - args.filter_height)) * filter_c_n_offset; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width + 1) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - args.filter_element_unaligned = - filter_ptr_y_unaligned + (filter_w - args.filter_width) * c_remainder_num; - i_impl_func(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - args.filter_y_offset = filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = 0; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - args.filter_element = filter_ptr_y; - args.filter_element_unaligned = filter_ptr_y_unaligned; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_w_rs1_1 = (args.filter_width >> 1) - 1; - args.filter_y_offset = - (filter_w - args.filter_width + 1) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_y_offset_unaligned = (filter_w - args.filter_width) * unaligned_filter_c_n_offset; - args.tie_depth2d_dilation_y_offset = - (args.tie_depth2d_dilation_y_offset_stable - - (args.filter_width - 1) * args.dilation_w * args.input_channel_with_padding) * - sizeof(feature_t); - args.tie_depth2d_next_hwx1 = 16 - - ((args.filter_width - 1) * args.dilation_w + - (args.filter_height - 1) * args.dilation_h * args.input_width) * - args.input_channel_with_padding * sizeof(feature_t); - i_impl_func(output_yx, input_x_real, (void *const)&args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - input_y_real += args.input_stride_y_offset; - } - } - } else // run c_impl_func - { - buffer_t *buffer = - (buffer_t *)tool::calloc_aligned(args.output_channel, sizeof(buffer_t), 16, MALLOC_CAP_8BIT); - feature_t *input_y_real; - feature_t *input_x_real; - feature_t *filter_ptr_y; - feature_t *output_yx = output_ptr; - int filter_c_n_offset = args.input_channel; - int filter_c_n_ptr_offset = filter_c_n_offset; - - for (size_t output_y = 0; output_y < n_h_head; output_y++) { - args.filter_height = filter_h - - ((args.padding_h_head - output_y * args.stride_y) + args.dilation_h - 1) / args.dilation_h; - input_y_real = input_ptr + - args.input_y_offset * - ((args.stride_y * output_y + (filter_h - args.filter_height) * args.dilation_h) - - args.padding_h_head); - filter_ptr_y = filter_ptr + (filter_h - args.filter_height) * filter_w * filter_c_n_ptr_offset; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - c_impl_func(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.filter_y_offset = 0; // ??? c, xtensa, tie 顺序不同 - args.filter_element = filter_ptr_y; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - c_impl_func(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - c_impl_func(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - } - - args.filter_height = filter_h; - input_y_real = input_ptr + args.input_y_offset * ((args.stride_y * n_h_head) - args.padding_h_head); - filter_ptr_y = filter_ptr; - - for (size_t output_y = 0; output_y < n_h_body; output_y++) { - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - c_impl_func(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.filter_y_offset = 0; // ??? c, xtensa, tie 顺序不同 - args.filter_element = filter_ptr_y; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - c_impl_func_sp(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - c_impl_func(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - input_y_real += args.input_stride_y_offset; - } - - for (size_t output_y = 0; output_y < n_h_tail; output_y++) { - args.filter_height = (args.padding_h_head + args.input_height - - (n_h_head + n_h_body + output_y) * args.stride_y + args.dilation_h - 1) / - args.dilation_h; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - - ((args.padding_w_head - output_x * args.stride_x) + args.dilation_w - 1) / args.dilation_w; - input_x_real = input_y_real + - args.input_channel * - ((args.stride_x * output_x + (filter_w - args.filter_width) * args.dilation_w) - - args.padding_w_head); - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - args.filter_element = filter_ptr_y + (filter_w - args.filter_width) * filter_c_n_ptr_offset; - c_impl_func(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - } - - input_x_real = input_y_real + args.input_channel * (args.stride_x * n_w_head - args.padding_w_head); - args.filter_width = filter_w; - args.filter_y_offset = 0; // ??? c, xtensa, tie 顺序不同 - args.filter_element = filter_ptr_y; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - c_impl_func(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = (args.padding_w_head + args.input_width - - (n_w_head + n_w_body + output_x) * args.stride_x + args.dilation_w - 1) / - args.dilation_w; - args.filter_y_offset = - (filter_w - args.filter_width) * filter_c_n_offset; // ??? c, xtensa, tie 顺序不同 - c_impl_func(buffer, input_x_real, args); - n_wise_tail(output_yx, buffer, args); - output_yx += args.output_x_offset; - input_x_real += args.input_stride_x_offset; - } - input_y_real += args.input_stride_y_offset; - } - tool::free_aligned(buffer); - } - } else { // padding valid - if (i_impl_func_sp) { - if (n_wise_tail) { - for (size_t output_y = 0; output_y < args.output_height; output_y++) { - feature_t *input_syx = input_ptr; - feature_t *output_yx = output_ptr; - - for (size_t output_x = 0; output_x < args.output_width; output_x++) { - i_impl_func_sp(output_yx, input_syx, (void *const)&args); - n_wise_tail(output_yx, NULL, args); - - input_syx += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - input_ptr += args.input_stride_y_offset; - output_ptr += args.output_y_offset; - } - } else { - for (size_t output_y = 0; output_y < args.output_height; output_y++) { - feature_t *input_syx = input_ptr; - feature_t *output_yx = output_ptr; - - for (size_t output_x = 0; output_x < args.output_width; output_x++) { - i_impl_func_sp(output_yx, input_syx, (void *const)&args); - - input_syx += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - input_ptr += args.input_stride_y_offset; - output_ptr += args.output_y_offset; - } - } - } else // run c_impl_func - { - args.filter_y_offset = 0; - buffer_t *buffer = - (buffer_t *)tool::calloc_aligned(args.output_channel, sizeof(buffer_t), 16, MALLOC_CAP_8BIT); - for (size_t output_y = 0; output_y < args.output_height; output_y++) { - feature_t *input_syx = input_ptr; - feature_t *output_yx = output_ptr; - - for (size_t output_x = 0; output_x < args.output_width; output_x++) { - c_impl_func_sp(buffer, input_syx, args); - n_wise_tail(output_yx, buffer, args); - - input_syx += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - input_ptr += args.input_stride_y_offset; - output_ptr += args.output_y_offset; - } - tool::free_aligned(buffer); - } - } - - if (args.mac_shift == INT_MIN) { - tool::free_aligned(args.tie_filter_channel_factor); - tool::free_aligned(args.filter_channel_factor); - } - - if (args.debug_value) { - tool::free_aligned(args.debug_value); - args.debug_value = nullptr; - } - - return; -} - -typedef void (*i_impl_acti_s16_t)(int16_t *, int16_t *, void *); -typedef void (*c_impl_acti_s16_t)(int16_t *, int16_t *, const ArgsType &); - -typedef void (*i_impl_acti_s8_t)(int8_t *, int8_t *, void *); -typedef void (*c_impl_acti_s8_t)(int8_t *, int8_t *, const ArgsType &); - -template -std::vector> get_activation_args(Tensor &output, - Tensor &input, - const Activation *activate = NULL, - const int core_number = 1) -{ - ArgsType args; - args.input_element = input.get_element_ptr(); - args.input_channel = input.shape[2]; - // args.input_stride_y_offset = input.shape[1] * input.shape[2]; - args.input_stride_x_offset = input.shape[2]; - - args.output_element = output.get_element_ptr(); - args.output_height = output.shape[0]; - args.output_width = output.shape[1]; - args.output_channel = output.shape[2]; - // args.output_y_offset = output.shape[1] * output.shape[2]; - args.output_x_offset = output.shape[2]; - - args.activation_type = activate ? activate->type : Linear; - switch (args.activation_type) { - case ReLU: - args.activation_alpha = 0; - args.activation_shift = 0; - args.activation_alpha_ptr = NULL; - break; - case LeakyReLU: - args.activation_alpha = activate->element[0]; - args.activation_shift = -activate->exponent; - args.activation_alpha_ptr = NULL; - break; - case PReLU: - args.activation_alpha_ptr = activate->element; - args.activation_shift = -activate->exponent; - break; - default: - args.activation_alpha_ptr = NULL; - args.activation_shift = -1; - break; - } - // for ISA - int u = 16 / sizeof(feature_t); - args.c_div_x_1 = input.shape[2] / u - 1; - args.c_remainder = (args.input_channel % u) * sizeof(feature_t); - - int c_div_x = input.shape[2] / u; - args.c_rs1_1 = DL_MAX(c_div_x / 2 - 1, 0); // actually c / 2u - 1 - args.c_rs2_1 = c_div_x - 2 * args.c_rs1_1 - 1; // actually c left - 1 - - // TODO: slice - std::vector> m_args(core_number, args); - if (core_number > 1) { - } - - return m_args; -} - -template -std::vector> get_activation_args(TensorBase *output, - TensorBase *input, - const activation_type_t activate = Linear, - TensorBase *activation_alpha = nullptr, - const runtime_mode_t runtime_mode = RUNTIME_MODE_AUTO) -{ - ArgsType args; - args.input_element = (feature_t *)input->get_element_ptr(); - args.input_channel = input->shape[3]; - // args.input_stride_y_offset = input->shape[2] * input->shape[3]; - args.input_stride_x_offset = input->shape[3]; - - args.output_element = (feature_t *)output->get_element_ptr(); - args.output_height = output->shape[1]; - args.output_width = output->shape[2]; - args.output_channel = output->shape[3]; - // args.output_y_offset = output->shape[2] * output->shape[3]; - args.output_x_offset = output->shape[3]; - - args.output_scale = 1; - if (output->exponent == input->exponent) - args.output_shift = 0; - else - args.output_shift = output->exponent - input->exponent; - - if (args.output_shift < 0) { // ( * output_scale ) >> output_shift - args.output_scale = 1 << (-args.output_shift); - args.output_shift = 0; - } - - switch (activate) { - case ReLU: - args.activation_alpha = 0; - args.activation_shift = 0; - args.activation_alpha_ptr = NULL; - break; - case LeakyReLU: - args.activation_alpha = *((int *)activation_alpha->get_element_ptr()); - args.activation_shift = -activation_alpha->exponent; - args.activation_alpha_ptr = NULL; - break; - case PReLU: - args.activation_alpha_ptr = activation_alpha->get_element_ptr(); - args.activation_shift = output->exponent - input->exponent - activation_alpha->exponent; - break; - default: - args.activation_alpha_ptr = NULL; - args.activation_shift = -1; - break; - } - // for ISA - int u = 16 / sizeof(feature_t); - args.c_div_x_1 = input->shape[3] / u - 1; - args.c_remainder = (args.input_channel % u) * sizeof(feature_t); - - int c_div_x = input->shape[3] / u; - args.c_rs1_1 = DL_MAX(c_div_x / 2 - 1, 0); // actually c / 2u - 1 - args.c_rs2_1 = c_div_x - 2 * args.c_rs1_1 - 1; // actually c left - 1 - - // TODO: slice - std::vector> m_args(1, args); - if (runtime_mode == RUNTIME_MODE_MULTI_CORE) { - // TODO: - } - - return m_args; -} - -template -void activation_shell(const ArgsType &args, - void (*i_impl_func)(feature_t *, feature_t *, void *), - void (*c_impl_func)(feature_t *, feature_t *, const ArgsType &)) -{ - feature_t *input_ptr = (feature_t *)args.input_element; - feature_t *output_ptr = (feature_t *)args.output_element; - size_t loop_size = args.output_height * args.output_width; - - if (i_impl_func) { - for (size_t i = 0; i < loop_size; i++) { - i_impl_func(output_ptr, input_ptr, (void *const)&args); - input_ptr += args.input_stride_x_offset; - output_ptr += args.output_x_offset; - } - } else // run c_impl_func - { - for (size_t i = 0; i < loop_size; i++) { - c_impl_func(output_ptr, input_ptr, args); - input_ptr += args.input_stride_x_offset; - output_ptr += args.output_x_offset; - } - } - return; -} - -// For Arithmetic: Add, Sub, Mul etc -template -struct arithArgsType { - feature_t *input0_element; /* -std::vector> get_arith_operation_args(Tensor &output, - Tensor &input0, - Tensor &input1, - const Activation *activate = NULL, - const int core_number = 1, - const int output_exponent = INT_MIN) -{ - arithArgsType args; - // op between (h w c) and (1 1 c) is allowed. - bool is_hw_input0_11 = (input0.shape[0] == 1) && (input0.shape[1] == 1); - bool is_hw_input1_11 = (input1.shape[0] == 1) && (input1.shape[1] == 1); - bool is_same_channel_num = input0.shape[2] == input1.shape[2]; - bool is_11c_and_hwc = is_same_channel_num && (is_hw_input0_11 || is_hw_input1_11); - bool is_same_shape = input0.is_same_shape(input1); - assert(is_same_shape || is_11c_and_hwc); - if (is_same_shape) { - args.height = input0.shape[0]; // inputs and output are the same shape - args.width = input0.shape[1]; - args.channel = input0.shape[2]; - args.input0_x_offset = input0.shape[2]; - args.input1_x_offset = input1.shape[2]; - } else { - if (is_hw_input0_11) { - args.height = input1.shape[0]; - args.width = input1.shape[1]; - args.channel = input1.shape[2]; - args.input0_x_offset = 0; - args.input1_x_offset = input1.shape[2]; - } else { - args.height = input0.shape[0]; - args.width = input0.shape[1]; - args.channel = input0.shape[2]; - args.input0_x_offset = input0.shape[2]; - args.input1_x_offset = 0; - } - } - - args.input0_element = input0.get_element_ptr(); - // args.input0_y_offset = input0.shape[1] * input0.shape[2]; - - args.input1_element = input1.get_element_ptr(); - // args.input1_y_offset = input1.shape[1] * input1.shape[2]; - - args.output_element = output.get_element_ptr(); // output - // args.output_y_offset = output.shape[1] * output.shape[2]; - args.output_x_offset = output.shape[2]; - - args.rescale_input = 1; - args.rescale_output = 1; - args.output_scale = 1; - args.input_shift = 0; - args.output_shift = 0; - - int real_output_exponent = (output_exponent != INT_MIN) ? output_exponent : output.exponent; - args.mul_shift = real_output_exponent - input0.exponent - input1.exponent; - - if (input0.exponent == input1.exponent) { - args.rescale_input = 0; - if (input0.exponent == real_output_exponent) { - args.rescale_output = 0; - args.input_shift = -1; // do not need to rescale - } else { - args.output_shift = real_output_exponent - input0.exponent; // right shift - } - } else { - if (input0.exponent < input1.exponent) { - args.rescale_input = 2; - args.input_shift = input1.exponent - input0.exponent; // input0 * 2^(-input_shift) - args.output_shift = real_output_exponent - input1.exponent; - } else { - // default: args.rescale_input = 1; - args.input_shift = input0.exponent - input1.exponent; // input1 * 2^(-input_shift) - args.output_shift = real_output_exponent - input0.exponent; - } - } - if (args.output_shift < 0) { // ( * output_scale ) >> output_shift - args.output_scale = 1 << (-args.output_shift); - args.output_shift = 0; - } - - args.neg_output_scale = -args.output_scale; - args.activation_type = activate ? activate->type : Linear; - - switch (args.activation_type) { - case ReLU: - args.activation_alpha = 0; - args.activation_shift = 0; - args.activation_alpha_ptr = NULL; - break; - case LeakyReLU: - args.activation_alpha = activate->element[0]; - args.activation_shift = -activate->exponent; - args.activation_alpha_ptr = NULL; - break; - case PReLU: - args.activation_alpha_ptr = activate->element; - args.activation_shift = -activate->exponent; - break; - default: - args.activation_alpha_ptr = NULL; - args.activation_shift = -1; - break; - } - - // for ISA - int u = 16 / sizeof(feature_t); - int c_div_x = input0.shape[2] / u; - args.c_remainder = (args.channel % u) * sizeof(feature_t); - args.c_div_x_1 = c_div_x - 1; - args.c_div_2x_1 = DL_MAX(c_div_x / 2 - 1, 0); - args.c_left_x_1 = c_div_x - 2 * args.c_div_2x_1 - 1; - - // slice - std::vector> m_args(core_number, args); - if (core_number > 1) { - // TODO: - } - - return m_args; -} - -template -std::vector> get_arith_operation_args(TensorBase *output, - TensorBase *input0, - TensorBase *input1, - const activation_type_t activate = Linear, - TensorBase *activation_alpha = nullptr, - const runtime_mode_t runtime_mode = RUNTIME_MODE_AUTO) -{ - arithArgsType args; - // op between (h w c) and (1 1 c) is allowed. - bool is_hw_input0_11 = (input0->shape[1] == 1) && (input0->shape[2] == 1); - bool is_hw_input1_11 = (input1->shape[1] == 1) && (input1->shape[2] == 1); - bool is_same_channel_num = input0->shape[3] == input1->shape[3]; - bool is_11c_and_hwc = is_same_channel_num && (is_hw_input0_11 || is_hw_input1_11); - bool is_same_shape = input0->shape == input1->shape; - assert(is_same_shape || is_11c_and_hwc); - if (is_same_shape) { - args.height = input0->shape[1]; // inputs and output are the same shape - args.width = input0->shape[2]; - args.channel = input0->shape[3]; - args.input0_x_offset = input0->shape[3]; - args.input1_x_offset = input1->shape[3]; - } else { - if (is_hw_input0_11) { - args.height = input1->shape[1]; - args.width = input1->shape[2]; - args.channel = input1->shape[3]; - args.input0_x_offset = 0; - args.input1_x_offset = input1->shape[3]; - } else { - args.height = input0->shape[1]; - args.width = input0->shape[2]; - args.channel = input0->shape[3]; - args.input0_x_offset = input0->shape[3]; - args.input1_x_offset = 0; - } - } - - args.input0_element = (feature_t *)input0->get_element_ptr(); - // args.input0_y_offset = input0->shape[2] * input0->shape[3]; - - args.input1_element = (feature_t *)input1->get_element_ptr(); - // args.input1_y_offset = input1->shape[2] * input1->shape[3]; - - args.output_element = (feature_t *)output->get_element_ptr(); // output - // args.output_y_offset = output->shape[1] * output->shape[2]; - args.output_x_offset = output->shape[3]; - - args.rescale_input = 1; - args.rescale_output = 1; - args.output_scale = 1; - args.input_shift = 0; - args.output_shift = 0; - - args.mul_shift = output->exponent - input0->exponent - input1->exponent; - - if (input0->exponent == input1->exponent) { - args.rescale_input = 0; - if (input0->exponent == output->exponent) { - args.rescale_output = 0; - args.input_shift = -1; // do not need to rescale - } else { - args.output_shift = output->exponent - input0->exponent; // right shift - } - } else { - if (input0->exponent < input1->exponent) { - args.rescale_input = 2; - args.input_shift = input1->exponent - input0->exponent; // input0 * 2^(-input_shift) - args.output_shift = output->exponent - input1->exponent; - } else { - // default: args.rescale_input = 1; - args.input_shift = input0->exponent - input1->exponent; // input1 * 2^(-input_shift) - args.output_shift = output->exponent - input0->exponent; - } - } - if (args.output_shift < 0) { // ( * output_scale ) >> output_shift - args.output_scale = 1 << (-args.output_shift); - args.output_shift = 0; - } - - args.neg_output_scale = -args.output_scale; - args.activation_type = activate; - - switch (args.activation_type) { - case ReLU: - args.activation_alpha = 0; - args.activation_shift = 0; - args.activation_alpha_ptr = NULL; - break; - case LeakyReLU: - // ESP_LOGE(__FUNCTION__, "Do not support Leaky ReLU"); - // args.activation_alpha = activation_alpha->get_element_ptr()[0]; - // args.activation_shift = -activation_alpha->exponent; - // args.activation_alpha_ptr = NULL; - break; - case PReLU: - // ESP_LOGE(__FUNCTION__, "Do not support PReLU"); - // args.activation_alpha_ptr = activation_alpha->get_element_ptr(); //TODO: auto_split - // args.activation_shift = -activation_alpha->exponent; - break; - default: - args.activation_alpha_ptr = NULL; - args.activation_shift = -1; - break; - } - - // for ISA - int u = 16 / sizeof(feature_t); - int c_div_x = input0->shape[3] / u; - args.c_remainder = (args.channel % u) * sizeof(feature_t); - args.c_div_x_1 = c_div_x - 1; - args.c_div_2x_1 = DL_MAX(c_div_x / 2 - 1, 0); - args.c_left_x_1 = c_div_x - 2 * args.c_div_2x_1 - 1; - - // slice - std::vector> m_args(1, args); - if (runtime_mode == RUNTIME_MODE_MULTI_CORE) { - // TODO: - } - - return m_args; -} -typedef void (*arith_i_impl_func_s16_t)(int16_t *, int16_t *, int16_t *, void *); -typedef void (*arith_c_impl_func_s16_t)(int16_t *, int16_t *, int16_t *, const arithArgsType &); -typedef void (*arith_n_wise_tail_s16_t)(int16_t *, const arithArgsType &); - -typedef void (*arith_i_impl_func_s8_t)(int8_t *, int8_t *, int8_t *, void *); -typedef void (*arith_c_impl_func_s8_t)(int8_t *, int8_t *, int8_t *, const arithArgsType &); -typedef void (*arith_n_wise_tail_s8_t)(int8_t *, const arithArgsType &); - -template -void arith_operation_shell( - const arithArgsType &args, - void (*arith_i_impl_func)(feature_t *, feature_t *, feature_t *, void *), - void (*arith_c_impl_func)(feature_t *, feature_t *, feature_t *, const arithArgsType &), - void (*arith_n_wise_tail)(feature_t *, const arithArgsType &)) -{ - feature_t *input0_ptr = args.input0_element; - feature_t *input1_ptr = args.input1_element; - feature_t *output_ptr = args.output_element; - - arithArgsType activation_args; - activation_args.channel = args.channel; - activation_args.activation_type = args.activation_type; - activation_args.activation_alpha = args.activation_alpha; - activation_args.activation_shift = args.activation_shift; - activation_args.activation_alpha_ptr = args.activation_alpha_ptr; - size_t loop_size = args.height * args.width; - - if (arith_i_impl_func) { - if (arith_n_wise_tail) { - if (args.rescale_input < 2) { - for (size_t i = 0; i < loop_size; i++) { - arith_i_impl_func(output_ptr, input0_ptr, input1_ptr, (void *const)&args); - arith_n_wise_tail(output_ptr, activation_args); - input0_ptr += args.input0_x_offset; - input1_ptr += args.input1_x_offset; - output_ptr += args.output_x_offset; - } - } else { - for (size_t i = 0; i < loop_size; i++) { - arith_i_impl_func(output_ptr, input1_ptr, input0_ptr, (void *const)&args); - arith_n_wise_tail(output_ptr, activation_args); - input0_ptr += args.input0_x_offset; - input1_ptr += args.input1_x_offset; - output_ptr += args.output_x_offset; - } - } - } else { - if (args.rescale_input < 2) { - for (size_t i = 0; i < loop_size; i++) { - arith_i_impl_func(output_ptr, input0_ptr, input1_ptr, (void *const)&args); - input0_ptr += args.input0_x_offset; - input1_ptr += args.input1_x_offset; - output_ptr += args.output_x_offset; - } - } else { - for (size_t i = 0; i < loop_size; i++) { - arith_i_impl_func(output_ptr, input1_ptr, input0_ptr, (void *const)&args); - input0_ptr += args.input0_x_offset; - input1_ptr += args.input1_x_offset; - output_ptr += args.output_x_offset; - } - } - } - } else // run c_impl_func - { - if (arith_n_wise_tail) { - if (args.rescale_input < 2) { - for (size_t i = 0; i < loop_size; i++) { - arith_c_impl_func(output_ptr, input0_ptr, input1_ptr, args); - arith_n_wise_tail(output_ptr, activation_args); - input0_ptr += args.input0_x_offset; - input1_ptr += args.input1_x_offset; - output_ptr += args.output_x_offset; - } - } else { - for (size_t i = 0; i < loop_size; i++) { - arith_c_impl_func(output_ptr, input1_ptr, input0_ptr, args); - arith_n_wise_tail(output_ptr, activation_args); - input0_ptr += args.input0_x_offset; - input1_ptr += args.input1_x_offset; - output_ptr += args.output_x_offset; - } - } - } else { - if (args.rescale_input < 2) { - for (size_t i = 0; i < loop_size; i++) { - arith_c_impl_func(output_ptr, input0_ptr, input1_ptr, args); - input0_ptr += args.input0_x_offset; - input1_ptr += args.input1_x_offset; - output_ptr += args.output_x_offset; - } - } else { - for (size_t i = 0; i < loop_size; i++) { - arith_c_impl_func(output_ptr, input1_ptr, input0_ptr, args); - input0_ptr += args.input0_x_offset; - input1_ptr += args.input1_x_offset; - output_ptr += args.output_x_offset; - } - } - } - } - - return; -} - -template -struct resizeArgsType { - feature_t *input_element; /* -std::vector> get_resize_operation_args(Tensor &output, - Tensor &input, - resize_mode_t resize_type, - float scale_y, - float scale_x, - const int core_number = 1) -{ - resizeArgsType args; - args.input_element = input.get_element_ptr(); - args.input_height = input.shape[0]; // inputs and output are the same shape - args.input_width = input.shape[1]; - args.input_channel = input.shape[2]; - args.output_element = output.get_element_ptr(); // output - - args.resize_type = resize_type; - args.scale_y = scale_y; - args.scale_x = scale_x; - - args.output_shift = output.exponent - input.exponent; - args.output_scale = 1; - if (args.output_shift < 0) { // ( * output_scale ) >> output_shift - args.output_scale = 1 << (-args.output_shift); - args.output_shift = 0; - } - - // for ISA - int u = 16 / sizeof(feature_t); - args.c_div_x = input.shape[2] / u; - args.c_remainder = (args.input_channel % u) * sizeof(feature_t); - if (args.resize_type == RESIZE_NEAREST) { - if (args.scale_y == 2 && args.scale_x == 2) { - args.output_x_offset = args.input_channel; - args.output_y_offset = args.input_channel * args.input_width * 2; - } - } - - // slice - std::vector> m_args(core_number, args); - if (core_number > 1) { - // TODO: - } - - return m_args; -} - -template -std::vector> get_resize_operation_args(TensorBase *output, - TensorBase *input, - resize_mode_t resize_type, - float scale_y, - float scale_x, - const runtime_mode_t runtime_mode = RUNTIME_MODE_AUTO) -{ - resizeArgsType args; - args.input_element = (feature_t *)input->get_element_ptr(); - args.input_height = input->shape[1]; // inputs and output are the same shape - args.input_width = input->shape[2]; - args.input_channel = input->shape[3]; - args.output_element = (feature_t *)output->get_element_ptr(); // output - - args.resize_type = resize_type; - args.scale_y = scale_y; - args.scale_x = scale_x; - - args.output_shift = output->exponent - input->exponent; - args.output_scale = 1; - if (args.output_shift < 0) { // ( * output_scale ) >> output_shift - args.output_scale = 1 << (-args.output_shift); - args.output_shift = 0; - } - - // for ISA - int u = 16 / sizeof(feature_t); - args.c_div_x = input->shape[3] / u; - args.c_remainder = (args.input_channel % u) * sizeof(feature_t); - if (args.resize_type == RESIZE_NEAREST) { - if (args.scale_y == 2 && args.scale_x == 2) { - args.output_x_offset = args.input_channel; - args.output_y_offset = args.input_channel * args.input_width * 2; - } - } - - // slice - std::vector> m_args(1, args); - if (runtime_mode == RUNTIME_MODE_MULTI_CORE) { - // TODO: - } - - return m_args; -} - -typedef void (*resize_i_impl_func_s16_t)(int16_t *, int16_t *, void *); -typedef void (*resize_c_impl_func_s16_t)(int16_t *, int16_t *, const resizeArgsType &); - -typedef void (*resize_i_impl_func_s8_t)(int8_t *, int8_t *, void *); -typedef void (*resize_c_impl_func_s8_t)(int8_t *, int8_t *, const resizeArgsType &); - -template -void resize2d_operation_shell(const resizeArgsType &args, - void (*resize_i_impl_func)(feature_t *, feature_t *, void *), - void (*resize_c_impl_func)(feature_t *, feature_t *, const resizeArgsType &)) -{ - feature_t *input_ptr = args.input_element; - feature_t *output_ptr = args.output_element; - - if (resize_i_impl_func) { - if (args.resize_type == RESIZE_NEAREST) { - if (args.scale_y == 2 && args.scale_x == 2) { - for (int i = 0; i < args.input_height; i++) { - for (int j = 0; j < args.input_width; j++) { - resize_i_impl_func(output_ptr, input_ptr, (void *const)&args); - input_ptr += args.input_channel; - output_ptr += args.input_channel * 2; - } - output_ptr += args.input_channel * 2 * args.input_width; - } - } - } - } else // run c_impl_func - { - if (args.resize_type == RESIZE_NEAREST) { - if (args.scale_y == 2 && args.scale_x == 2) { - for (int i = 0; i < args.input_height; i++) { - for (int j = 0; j < args.input_width; j++) { - resize_c_impl_func(output_ptr, input_ptr, args); - input_ptr += args.input_channel; - output_ptr += args.input_channel * 2; - } - output_ptr += args.input_channel * 2 * args.input_width; - } - } - } - } - - return; -} - -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_activate_buffer.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_activate_buffer.hpp deleted file mode 100644 index 1684d04d..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_activate_buffer.hpp +++ /dev/null @@ -1,281 +0,0 @@ -#pragma once - -#include "dl_base.hpp" -#include "dl_define.hpp" -#include "dl_tool.hpp" -#include - -namespace dl { -namespace base { -template -inline void buffer_bias_linear(feature_t *output_ptr, buffer_t *buffer_ptr, const ArgsType &args) -{ - bias_t *bias_ptr = (bias_t *)args.bias_element; - if (args.mac_shift == INT_MIN) // per-channel - { - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], 4); - // Bias - buffer_ptr[output_c] += bias_ptr[output_c]; - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], args.filter_channel_factor[output_c] - 4); - - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - buffer_ptr[output_c] = 0; - } - } else { - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], args.mac_shift); - // Bias - buffer_ptr[output_c] += bias_ptr[output_c]; - - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - buffer_ptr[output_c] = 0; - } - } -} - -template -inline void buffer_bias_relu(feature_t *output_ptr, buffer_t *buffer_ptr, const ArgsType &args) -{ - bias_t *bias_ptr = (bias_t *)args.bias_element; - if (args.mac_shift == INT_MIN) // per-channel - { - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], 4); - // Bias - buffer_ptr[output_c] += bias_ptr[output_c]; - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], args.filter_channel_factor[output_c] - 4); - // Activation - if (buffer_ptr[output_c] < 0) - buffer_ptr[output_c] = 0; - - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - buffer_ptr[output_c] = 0; - } - } else { - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], args.mac_shift); - // Bias - buffer_ptr[output_c] += bias_ptr[output_c]; - // Activation - if (buffer_ptr[output_c] < 0) - buffer_ptr[output_c] = 0; - - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - buffer_ptr[output_c] = 0; - } - } -} - -template -inline void buffer_bias_leakyrelu(feature_t *output_ptr, buffer_t *buffer_ptr, const ArgsType &args) -{ - bias_t *bias_ptr = (bias_t *)args.bias_element; - if (args.mac_shift == INT_MIN) // per-channel - { - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], 4); - // Bias - buffer_ptr[output_c] += bias_ptr[output_c]; - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], args.filter_channel_factor[output_c] - 4); - // Activation - if (buffer_ptr[output_c] < 0) { - buffer_ptr[output_c] *= args.activation_alpha; - buffer_ptr[output_c] >>= args.activation_shift; - } - - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - buffer_ptr[output_c] = 0; - } - } else { - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], args.mac_shift); - // Bias - buffer_ptr[output_c] += bias_ptr[output_c]; - // Activation - if (buffer_ptr[output_c] < 0) { - buffer_ptr[output_c] *= args.activation_alpha; - buffer_ptr[output_c] >>= args.activation_shift; - } - - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - buffer_ptr[output_c] = 0; - } - } -} - -template -inline void buffer_bias_prelu(feature_t *output_ptr, buffer_t *buffer_ptr, const ArgsType &args) -{ - bias_t *bias_ptr = (bias_t *)args.bias_element; - feature_t *alpha_ptr = (feature_t *)args.activation_alpha_ptr; - if (args.mac_shift == INT_MIN) // per-channel - { - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], 4); - // Bias - buffer_ptr[output_c] += bias_ptr[output_c]; - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], args.filter_channel_factor[output_c] - 4); - - // Activation - if (buffer_ptr[output_c] < 0) { - buffer_ptr[output_c] *= alpha_ptr[output_c]; - buffer_ptr[output_c] >>= args.activation_shift; - } - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - - buffer_ptr[output_c] = 0; - } - } else { - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], args.mac_shift); - // Bias - buffer_ptr[output_c] += bias_ptr[output_c]; - // Activation - if (buffer_ptr[output_c] < 0) { - buffer_ptr[output_c] *= alpha_ptr[output_c]; - buffer_ptr[output_c] >>= args.activation_shift; - } - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - - buffer_ptr[output_c] = 0; - } - } -} - -/** - * @brief without bias - * - */ -template -inline void buffer_0000_linear(feature_t *output_ptr, buffer_t *buffer_ptr, const ArgsType &args) -{ - if (args.mac_shift == INT_MIN) // per-channel - { - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], args.filter_channel_factor[output_c]); - - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - buffer_ptr[output_c] = 0; - } - } else { - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], args.mac_shift); - - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - buffer_ptr[output_c] = 0; - } - } -} - -template -inline void buffer_0000_relu(feature_t *output_ptr, buffer_t *buffer_ptr, const ArgsType &args) -{ - if (args.mac_shift == INT_MIN) // per-channel - { - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], args.filter_channel_factor[output_c]); - // Activation - if (buffer_ptr[output_c] < 0) - buffer_ptr[output_c] = 0; - - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - buffer_ptr[output_c] = 0; - } - } else { - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], args.mac_shift); - // Activation - if (buffer_ptr[output_c] < 0) - buffer_ptr[output_c] = 0; - - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - buffer_ptr[output_c] = 0; - } - } -} - -template -inline void buffer_0000_leakyrelu(feature_t *output_ptr, buffer_t *buffer_ptr, const ArgsType &args) -{ - if (args.mac_shift == INT_MIN) // per-channel - { - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], args.filter_channel_factor[output_c]); - // Activation - if (buffer_ptr[output_c] < 0) { - buffer_ptr[output_c] *= args.activation_alpha; - buffer_ptr[output_c] >>= args.activation_shift; - } - - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - buffer_ptr[output_c] = 0; - } - } else { - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], args.mac_shift); - // Activation - if (buffer_ptr[output_c] < 0) { - buffer_ptr[output_c] *= args.activation_alpha; - buffer_ptr[output_c] >>= args.activation_shift; - } - - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - buffer_ptr[output_c] = 0; - } - } -} - -template -inline void buffer_0000_prelu(feature_t *output_ptr, buffer_t *buffer_ptr, const ArgsType &args) -{ - feature_t *alpha_ptr = (feature_t *)args.activation_alpha_ptr; - if (args.mac_shift == INT_MIN) // per-channel - { - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], args.filter_channel_factor[output_c]); - // Activation - if (buffer_ptr[output_c] < 0) { - buffer_ptr[output_c] *= alpha_ptr[output_c]; - buffer_ptr[output_c] >>= args.activation_shift; - } - - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - buffer_ptr[output_c] = 0; - } - } else { - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // right shift - buffer_ptr[output_c] = DL_RIGHT_SHIFT(buffer_ptr[output_c], args.mac_shift); - // Activation - if (buffer_ptr[output_c] < 0) { - buffer_ptr[output_c] *= alpha_ptr[output_c]; - buffer_ptr[output_c] >>= args.activation_shift; - } - - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - buffer_ptr[output_c] = 0; - } - } -} -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_activate_output.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_activate_output.hpp deleted file mode 100644 index dd7a130e..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_activate_output.hpp +++ /dev/null @@ -1,90 +0,0 @@ -#pragma once - -#include "dl_define.hpp" -#include "dl_tool.hpp" -#include - -namespace dl { -namespace base { -template -inline void output_linear(feature_t *output_element, buffer_t *buffer, const ArgsType &args) -{ -} - -template -inline void output_relu(feature_t *output_element, buffer_t *buffer, const ArgsType &args) -{ - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - if (output_element[output_c] < 0) - output_element[output_c] = 0; - } -} - -template -inline void output_leakyrelu(feature_t *output_element, buffer_t *buffer, const ArgsType &args) -{ - int temp; - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - if (output_element[output_c] < 0) { - temp = DL_RIGHT_SHIFT((buffer_t)output_element[output_c] * (buffer_t)args.activation_alpha, - args.activation_shift); - tool::truncate(output_element[output_c], temp); - } - } -} - -template -inline void output_prelu(feature_t *output_element, buffer_t *buffer, const ArgsType &args) -{ - int temp; - activate_t *alpha_ptr = (activate_t *)args.activation_alpha_ptr; - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - // Activation - if (output_element[output_c] < 0) { - temp = DL_RIGHT_SHIFT((buffer_t)output_element[output_c] * (buffer_t)alpha_ptr[output_c], - args.activation_shift); - tool::truncate(output_element[output_c], temp); - } - } -} - -template -inline void arith_output_relu(feature_t *output_element, const arithArgsType &args) -{ - for (size_t output_c = 0; output_c < args.channel; output_c++) { - if (output_element[output_c] < 0) - output_element[output_c] = 0; - } -} - -template -inline void arith_output_leakyrelu(feature_t *output_element, const arithArgsType &args) -{ - int temp; - for (size_t output_c = 0; output_c < args.channel; output_c++) { - if (output_element[output_c] < 0) { - temp = DL_RIGHT_SHIFT((buffer_t)output_element[output_c] * (buffer_t)args.activation_alpha, - args.activation_shift); - tool::truncate(output_element[output_c], temp); - output_element[output_c] = (feature_t)temp; - } - } -} - -template -inline void arith_output_prelu(feature_t *output_element, const arithArgsType &args) -{ - int temp; - feature_t *alpha_ptr = (feature_t *)args.activation_alpha_ptr; - for (size_t output_c = 0; output_c < args.channel; output_c++) { - // Activation - if (output_element[output_c] < 0) { - temp = DL_RIGHT_SHIFT((buffer_t)output_element[output_c] * (buffer_t)alpha_ptr[output_c], - args.activation_shift); - tool::truncate(output_element[output_c], temp); - output_element[output_c] = (feature_t)temp; - } - } -} -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_add2d.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_add2d.cpp deleted file mode 100644 index 75840c76..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_add2d.cpp +++ /dev/null @@ -1,295 +0,0 @@ -#include "dl_base_add2d.hpp" - -#include "dl_base_activate_output.hpp" -#include "dl_base_isa.hpp" - -namespace dl { -namespace base { -template -inline void add2d_11c(feature_t *output_ptr, - feature_t *input0_ptr, - feature_t *input1_ptr, - const arithArgsType &args) -{ - buffer_t buffer; - for (size_t output_c = 0; output_c < args.channel; output_c++) // C - { - buffer = (buffer_t)input0_ptr[output_c] + (buffer_t)input1_ptr[output_c]; - tool::truncate(output_ptr[output_c], buffer); - } -} - -template -inline void add2d_11c_rescale(feature_t *output_ptr, - feature_t *input0_ptr, - feature_t *input1_ptr, - const arithArgsType &args) -{ - buffer_t buffer; - for (size_t output_c = 0; output_c < args.channel; output_c++) // C - { - buffer = (buffer_t)input0_ptr[output_c] + (buffer_t)(DL_RIGHT_SHIFT(input1_ptr[output_c], args.input_shift)); - buffer = DL_RIGHT_SHIFT(buffer * args.output_scale, args.output_shift); - tool::truncate(output_ptr[output_c], buffer); - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize add2d -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -inline void load_add2d_11c_s16(arith_i_impl_func_s16_t &i_impl_func, - arith_c_impl_func_s16_t &c_impl_func, - arith_n_wise_tail_s16_t &n_wise_tail, - const arithArgsType &args) -{ -#if CONFIG_ESP32P4_BOOST - if (args.input0_x_offset % 8 == 0 && args.input1_x_offset % 8 == 0 && args.output_x_offset % 8 == 0 && - !((unsigned)&args.input0_element[0] & 15) && !((unsigned)&args.input1_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - switch (args.activation_type) { - case Linear: - - if (args.input_shift == -1) { - i_impl_func = dl_esp32p4_s16_add2d_11c; - } else - i_impl_func = dl_esp32p4_s16_rescale_add2d_11c; - break; - case ReLU: - case LeakyReLU: - if (args.input_shift == -1) { - i_impl_func = dl_esp32p4_s16_add2d_11c_relu; - } else - i_impl_func = dl_esp32p4_s16_rescale_add2d_11c_relu; - break; - case PReLU: - if (args.input_shift == -1) { - i_impl_func = dl_esp32p4_s16_add2d_11c_prelu; - } else - i_impl_func = dl_esp32p4_s16_rescale_add2d_11c_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func = dl_esp32p4_s16_unaligned_add2d_11c; - break; - case ReLU: - case LeakyReLU: - i_impl_func = dl_esp32p4_s16_unaligned_add2d_11c_relu; - break; - case PReLU: - i_impl_func = dl_esp32p4_s16_unaligned_add2d_11c_prelu; - break; - } - } -#elif CONFIG_TIE728_BOOST - if (args.input0_x_offset % 8 == 0 && args.input1_x_offset % 8 == 0 && args.output_x_offset % 8 == 0 && - !((unsigned)&args.input0_element[0] & 15) && !((unsigned)&args.input1_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - switch (args.activation_type) { - case Linear: - - if (args.input_shift == -1) { - i_impl_func = dl_tie728_s16_add2d_11c; - } else - i_impl_func = dl_tie728_s16_rescale_add2d_11c; - break; - case ReLU: - case LeakyReLU: - if (args.input_shift == -1) { - i_impl_func = dl_tie728_s16_add2d_11c_relu; - } else - i_impl_func = dl_tie728_s16_rescale_add2d_11c_relu; - break; - case PReLU: - if (args.input_shift == -1) { - i_impl_func = dl_tie728_s16_add2d_11c_prelu; - } else - i_impl_func = dl_tie728_s16_rescale_add2d_11c_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func = dl_tie728_s16_unaligned_add2d_11c; - break; - case ReLU: - case LeakyReLU: - i_impl_func = dl_tie728_s16_unaligned_add2d_11c_relu; - break; - case PReLU: - i_impl_func = dl_tie728_s16_unaligned_add2d_11c_prelu; - break; - } - } - -#else - if (args.input_shift == -1) - c_impl_func = add2d_11c; - else - c_impl_func = add2d_11c_rescale; - - switch (args.activation_type) { - case Linear: - n_wise_tail = NULL; - break; - case ReLU: - n_wise_tail = arith_output_relu; - break; - case LeakyReLU: - n_wise_tail = arith_output_leakyrelu; - break; - case PReLU: - n_wise_tail = arith_output_prelu; - break; - } -#endif // CONFIG_TIE728_BOOST -} - -template <> -void add2d(void *const args_ptr) -{ - const arithArgsType &args = *((arithArgsType *)args_ptr); - - arith_i_impl_func_s16_t i_impl_func = NULL; - arith_c_impl_func_s16_t c_impl_func = NULL; - arith_n_wise_tail_s16_t n_wise_tail = NULL; - -#if CONFIG_ESP32P4_BOOST - dl_esp32p4_cfg_round(ROUND_MODE_HALF_EVEN); -#endif - - load_add2d_11c_s16(i_impl_func, c_impl_func, n_wise_tail, args); - - arith_operation_shell(args, i_impl_func, c_impl_func, n_wise_tail); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize add2d -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -inline void load_add2d_11c_s8(arith_i_impl_func_s8_t &i_impl_func, - arith_c_impl_func_s8_t &c_impl_func, - arith_n_wise_tail_s8_t &n_wise_tail, - const arithArgsType &args) -{ -#if CONFIG_ESP32P4_BOOST - if (args.input0_x_offset % 16 == 0 && args.input1_x_offset % 16 == 0 && args.output_x_offset % 16 == 0 && - !((unsigned)&args.input0_element[0] & 15) && !((unsigned)&args.input1_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - switch (args.activation_type) { - case Linear: - - if (args.input_shift == -1) { - i_impl_func = dl_esp32p4_s8_add2d_11c; - } else - i_impl_func = dl_esp32p4_s8_rescale_add2d_11c; - break; - case ReLU: - case LeakyReLU: - if (args.input_shift == -1) { - i_impl_func = dl_esp32p4_s8_add2d_11c_relu; - } else - i_impl_func = dl_esp32p4_s8_rescale_add2d_11c_relu; - break; - case PReLU: - if (args.input_shift == -1) { - i_impl_func = dl_esp32p4_s8_add2d_11c_prelu; - } else - i_impl_func = dl_esp32p4_s8_rescale_add2d_11c_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func = dl_esp32p4_s8_unaligned_add2d_11c; - break; - case ReLU: - case LeakyReLU: - i_impl_func = dl_esp32p4_s8_unaligned_add2d_11c_relu; - break; - case PReLU: - i_impl_func = dl_esp32p4_s8_unaligned_add2d_11c_prelu; - break; - } - } -#elif CONFIG_TIE728_BOOST - if (args.input0_x_offset % 16 == 0 && args.input1_x_offset % 16 == 0 && args.output_x_offset % 16 == 0 && - !((unsigned)&args.input0_element[0] & 15) && !((unsigned)&args.input1_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - switch (args.activation_type) { - case Linear: - - if (args.input_shift == -1) { - i_impl_func = dl_tie728_s8_add2d_11c; - } else - i_impl_func = dl_tie728_s8_rescale_add2d_11c; - break; - case ReLU: - case LeakyReLU: - if (args.input_shift == -1) { - i_impl_func = dl_tie728_s8_add2d_11c_relu; - } else - i_impl_func = dl_tie728_s8_rescale_add2d_11c_relu; - break; - case PReLU: - if (args.input_shift == -1) { - i_impl_func = dl_tie728_s8_add2d_11c_prelu; - } else - i_impl_func = dl_tie728_s8_rescale_add2d_11c_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func = dl_tie728_s8_unaligned_add2d_11c; - break; - case ReLU: - case LeakyReLU: - i_impl_func = dl_tie728_s8_unaligned_add2d_11c_relu; - break; - case PReLU: - i_impl_func = dl_tie728_s8_unaligned_add2d_11c_prelu; - break; - } - } -#else - if (args.input_shift == -1) - c_impl_func = add2d_11c; - else - c_impl_func = add2d_11c_rescale; - - switch (args.activation_type) { - case Linear: - n_wise_tail = NULL; - break; - case ReLU: - n_wise_tail = arith_output_relu; - break; - case LeakyReLU: - n_wise_tail = arith_output_leakyrelu; - break; - case PReLU: - n_wise_tail = arith_output_prelu; - break; - } -#endif // CONFIG_TIE728_BOOST -} - -template <> -void add2d(void *const args_ptr) -{ - const arithArgsType &args = *((arithArgsType *)args_ptr); - - arith_i_impl_func_s8_t i_impl_func = NULL; - arith_c_impl_func_s8_t c_impl_func = NULL; - arith_n_wise_tail_s8_t n_wise_tail = NULL; - -#if CONFIG_ESP32P4_BOOST - dl_esp32p4_cfg_round(ROUND_MODE_HALF_EVEN); -#endif - load_add2d_11c_s8(i_impl_func, c_impl_func, n_wise_tail, args); - - arith_operation_shell(args, i_impl_func, c_impl_func, n_wise_tail); -} -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_add2d.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_add2d.hpp deleted file mode 100644 index e36a1ce5..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_add2d.hpp +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#include "dl_base.hpp" - -namespace dl { -namespace base { -/** - * @brief add2d - * - * @tparam feature_t - * @param args_ptr - */ -template -void add2d(void *const args_ptr); - -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_avg_pool2d.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_avg_pool2d.cpp deleted file mode 100644 index 1b32b25c..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_avg_pool2d.cpp +++ /dev/null @@ -1,125 +0,0 @@ -#include "dl_base_avg_pool2d.hpp" - -#include "dl_base_activate_buffer.hpp" -#include "dl_base_activate_output.hpp" -#include "dl_base_isa.hpp" - -namespace dl { -namespace base { -template -inline void avgpool2d_hwc1(buffer_t *buffer_ptr, - feature_t *input_ptr, - feature_t *output_ptr, - PoolArgsType &args) -{ - buffer_t input; - buffer_t avg_pool_area_inv = 1.f / args.avg_pool_area; - for (size_t filter_y = 0; filter_y < args.filter_height; filter_y++) // H - { // - feature_t *input_yx = input_ptr; - for (size_t filter_x = 0; filter_x < args.filter_width; filter_x++) // W - { // - for (size_t input_c = 0; input_c < args.input_channel; input_c++) // C - { - input = (buffer_t)input_yx[input_c] * DL_SCALE(args.input_exponent); - buffer_ptr[input_c] += input; - } - input_yx += args.input_x_offset; - } - input_ptr += args.input_y_offset; - } - - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - tool::truncate(output_ptr[output_c], - tool::round(buffer_ptr[output_c] * avg_pool_area_inv / DL_SCALE(args.output_exponent))); - buffer_ptr[output_c] = 0; - } -} - -inline void load_avg_pool2d_hwc1_s16(i_impl_func_s16_t &i_impl_func, - i_impl_func_s16_t &i_impl_func_sp, - avg_pool_c_impl_func_s16_t &c_impl_func, - PoolArgsType &args) -{ -#if CONFIG_TIE728_BOOST - if (args.input_x_offset % 8 == 0 && args.output_x_offset % 8 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_tie728_s16_avg_pool2d_hwc1; - i_impl_func_sp = (args.filter_height == 2 && args.filter_width == 2) ? dl_tie728_s16_avg_pool2d_22c1 - : dl_tie728_s16_avg_pool2d_hwc1; - } else { - i_impl_func = dl_tie728_s16_unaligned_avg_pool2d_hwc1; - i_impl_func_sp = (args.filter_height == 2 && args.filter_width == 2) ? dl_tie728_s16_unaligned_avg_pool2d_22c1 - : dl_tie728_s16_unaligned_avg_pool2d_hwc1; - } -#else - c_impl_func = avgpool2d_hwc1; -#endif -} - -template <> -void avg_pool2d(void *args_ptr) -{ - PoolArgsType &args = *((PoolArgsType *)args_ptr); - - i_impl_func_s16_t i_impl_func = NULL; - i_impl_func_s16_t i_impl_func_sp = NULL; - avg_pool_c_impl_func_s16_t c_impl_func = NULL; - - load_avg_pool2d_hwc1_s16(i_impl_func, i_impl_func_sp, c_impl_func, args); - avg_pool_shell(args, i_impl_func, i_impl_func_sp, c_impl_func); -} - -inline void load_avg_pool2d_hwc1_s8(i_impl_func_s8_t &i_impl_func, - i_impl_func_s8_t &i_impl_func_sp, - avg_pool_c_impl_func_s8_t &c_impl_func, - PoolArgsType &args) -{ -#if CONFIG_ACCURATE_INFER - c_impl_func = avgpool2d_hwc1; -#else -#if CONFIG_ESP32P4_BOOST - if (args.input_x_offset % 16 == 0 && args.output_x_offset % 16 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_esp32p4_s8_avg_pool2d_hwc1; - i_impl_func_sp = (args.filter_height == 2 && args.filter_width == 2) ? dl_esp32p4_s8_avg_pool2d_22c1 - : dl_esp32p4_s8_avg_pool2d_hwc1; - } else { - i_impl_func = dl_esp32p4_s8_unaligned_avg_pool2d_hwc1; - i_impl_func_sp = (args.filter_height == 2 && args.filter_width == 2) ? dl_esp32p4_s8_unaligned_avg_pool2d_22c1 - : dl_esp32p4_s8_unaligned_avg_pool2d_hwc1; - } -#elif CONFIG_TIE728_BOOST - if (args.input_x_offset % 16 == 0 && args.output_x_offset % 16 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_tie728_s8_avg_pool2d_hwc1; - i_impl_func_sp = (args.filter_height == 2 && args.filter_width == 2) ? dl_tie728_s8_avg_pool2d_22c1 - : dl_tie728_s8_avg_pool2d_hwc1; - } else { - i_impl_func = dl_tie728_s8_unaligned_avg_pool2d_hwc1; - i_impl_func_sp = (args.filter_height == 2 && args.filter_width == 2) ? dl_tie728_s8_unaligned_avg_pool2d_22c1 - : dl_tie728_s8_unaligned_avg_pool2d_hwc1; - } -#else - c_impl_func = avgpool2d_hwc1; -#endif -#endif -} - -template <> -void avg_pool2d(void *args_ptr) -{ - PoolArgsType &args = *((PoolArgsType *)args_ptr); - - i_impl_func_s8_t i_impl_func = NULL; - i_impl_func_s8_t i_impl_func_sp = NULL; - avg_pool_c_impl_func_s8_t c_impl_func = NULL; -#if CONFIG_ESP32P4_BOOST - dl_esp32p4_cfg_round(ROUND_MODE_HALF_EVEN); -#endif - - load_avg_pool2d_hwc1_s8(i_impl_func, i_impl_func_sp, c_impl_func, args); - avg_pool_shell(args, i_impl_func, i_impl_func_sp, c_impl_func); -} -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_avg_pool2d.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_avg_pool2d.hpp deleted file mode 100644 index 9a539954..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_avg_pool2d.hpp +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -#include "dl_base_pool2d.hpp" - -namespace dl { -namespace base { -/** - * @brief - * - * @tparam feature_t - * @param args_ptr - */ -template -void avg_pool2d(void *args_ptr); -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_conv2d.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_conv2d.cpp deleted file mode 100644 index 68957950..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_conv2d.cpp +++ /dev/null @@ -1,1436 +0,0 @@ -#include "dl_base_conv2d.hpp" - -#include "dl_base_activate_buffer.hpp" -#include "dl_base_activate_output.hpp" -#include "dl_base_isa.hpp" - -namespace dl { -namespace base { -template -inline void conv2d_11cn(buffer_t *buffer_ptr, feature_t *input_ptr, const ArgsType &args) -{ - const feature_t *filter_element = (const feature_t *)args.filter_element; - - // filter in sequence [H, W, C, N] - // for (size_t input_c = 0; input_c < args.input_channel; input_c++) - // { - // for (size_t output_c = 0; output_c < args.output_channel; output_c++) - // { - // buffer_ptr[output_c] += input_ptr[input_c] * (*filter_element++); - // } - // } - - // filter in sequence [N, H, W, C] - for (size_t output_c = 0; output_c < args.output_channel; output_c++) // N - { - buffer_t acc = 0; - for (size_t input_c = 0; input_c < args.input_channel; input_c++) // C - { - acc += input_ptr[input_c] * (*filter_element++); - } - buffer_ptr[output_c] = acc; - } -} - -template -inline void conv2d_33cn(buffer_t *buffer_ptr, feature_t *input_ptr, const ArgsType &args) -{ - // filter in sequence [H, W, C, N] - // const filter_t *filter_r0 = args.filter_element; - // const filter_t *filter_r1 = filter_r0 + args.filter_y_offset; - // const filter_t *filter_r2 = filter_r1 + args.filter_y_offset; - // feature_t *&input_syx_d0 = input_ptr; - - // for (size_t filter_x = 0; filter_x < 3; filter_x++) // W - // { // - // feature_t *input_syx_d1 = input_syx_d0 + args.input_dilation_y_offset; // - // feature_t *input_syx_d2 = input_syx_d1 + args.input_dilation_y_offset; // - // for (size_t input_c = 0; input_c < args.input_channel; input_c++) // C - // { // - // for (size_t output_c = 0; output_c < args.output_channel; output_c++) // N - // { - // buffer_ptr[output_c] += input_syx_d0[input_c] * (*filter_r0); - // buffer_ptr[output_c] += input_syx_d1[input_c] * (*filter_r1); - // buffer_ptr[output_c] += input_syx_d2[input_c] * (*filter_r2); - - // filter_r0++; - // filter_r1++; - // filter_r2++; - // } - // } - // input_syx_d0 += args.input_dilation_x_offset; - // } - - // filter in sequence [N, H, W, C] - feature_t *input_00 = input_ptr; - feature_t *input_01 = input_00 + args.input_dilation_x_offset; - feature_t *input_02 = input_01 + args.input_dilation_x_offset; - - feature_t *input_10 = input_00 + args.input_dilation_y_offset; - feature_t *input_11 = input_10 + args.input_dilation_x_offset; - feature_t *input_12 = input_11 + args.input_dilation_x_offset; - - feature_t *input_20 = input_10 + args.input_dilation_y_offset; - feature_t *input_21 = input_20 + args.input_dilation_x_offset; - feature_t *input_22 = input_21 + args.input_dilation_x_offset; - - const feature_t *filter_00 = (const feature_t *)args.filter_element; - const feature_t *filter_01 = filter_00 + args.input_channel; - const feature_t *filter_02 = filter_01 + args.input_channel; - - const feature_t *filter_10 = filter_00 + args.filter_y_offset_c; - const feature_t *filter_11 = filter_10 + args.input_channel; - const feature_t *filter_12 = filter_11 + args.input_channel; - - const feature_t *filter_20 = filter_10 + args.filter_y_offset_c; - const feature_t *filter_21 = filter_20 + args.input_channel; - const feature_t *filter_22 = filter_21 + args.input_channel; - - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - buffer_t acc = 0; - for (size_t input_c = 0; input_c < args.input_channel; input_c++) { - acc += input_00[input_c] * filter_00[input_c]; - acc += input_01[input_c] * filter_01[input_c]; - acc += input_02[input_c] * filter_02[input_c]; - acc += input_10[input_c] * filter_10[input_c]; - acc += input_11[input_c] * filter_11[input_c]; - acc += input_12[input_c] * filter_12[input_c]; - acc += input_20[input_c] * filter_20[input_c]; - acc += input_21[input_c] * filter_21[input_c]; - acc += input_22[input_c] * filter_22[input_c]; - } - filter_00 += args.filter_n_offset_c; - filter_01 += args.filter_n_offset_c; - filter_02 += args.filter_n_offset_c; - filter_10 += args.filter_n_offset_c; - filter_11 += args.filter_n_offset_c; - filter_12 += args.filter_n_offset_c; - filter_20 += args.filter_n_offset_c; - filter_21 += args.filter_n_offset_c; - filter_22 += args.filter_n_offset_c; - - buffer_ptr[output_c] = acc; - } -} - -template -inline void conv2d_hwcn(buffer_t *buffer_ptr, feature_t *input_ptr, const ArgsType &args) -{ - // filter in sequence [H, W, C, N] - // const filter_t *filter_element = args.filter_element; // Reload filter - // feature_t *&input_syx_dy = input_ptr; // - // for (size_t filter_y = 0; filter_y < args.filter_height; filter_y++) // H - // { // - // feature_t *input_syx_dyx = input_syx_dy; // - // for (size_t filter_x = 0; filter_x < args.filter_width; filter_x++) // W - // { // - // for (size_t input_c = 0; input_c < args.input_channel; input_c++) // C - // { // - // for (size_t output_c = 0; output_c < args.output_channel; output_c++) // N - // { - // buffer_ptr[output_c] += input_syx_dyx[input_c] * (*filter_element); - // filter_element++; - // } - // } - // input_syx_dyx += args.input_dilation_x_offset; - // } - // input_syx_dy += args.input_dilation_y_offset; - // } - - // filter in sequence [N, H, W, C] - const feature_t *filter_element = (const feature_t *)args.filter_element; - for (size_t output_c = 0; output_c < args.output_channel; output_c++) // N - { // - feature_t *input_syx_dy = input_ptr; // - buffer_t acc = 0; // - for (size_t filter_y = 0; filter_y < args.filter_height; filter_y++) // H - { // - feature_t *input_syx_dyx = input_syx_dy; // - for (size_t filter_x = 0; filter_x < args.filter_width; filter_x++) // W - { // - for (size_t input_c = 0; input_c < args.input_channel; input_c++) // C - { - acc += input_syx_dyx[input_c] * (*filter_element++); - } - input_syx_dyx += args.input_dilation_x_offset; - } - filter_element += args.filter_y_offset; - input_syx_dy += args.input_dilation_y_offset; - } - filter_element += args.filter_n_offset; - buffer_ptr[output_c] = acc; - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize conv2d -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -inline void load_conv2d_11cn_s16(i_impl_func_s16_t &i_impl_func, - i_impl_func_s16_t &i_impl_func_sp, - c_impl_func_s16_t &c_impl_func, - c_impl_func_s16_t &c_impl_func_sp, - n_wise_func_s16_t &n_wise_func, - const ArgsType &args) -{ -#if CONFIG_ESP32P4_BOOST - if (args.output_channel % 8 == 0 && args.input_channel % 8 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s16_conv2d_11cn_bias; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s16_conv2d_11cn_bias_relu; - break; - case LeakyReLU: - // i_impl_func_sp = dl_esp32p4_s16_conv2d_11cn_bias_relu; - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s16_conv2d_11cn_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s16_conv2d_11cn; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s16_conv2d_11cn_relu; - break; - case LeakyReLU: - // i_impl_func_sp = dl_esp32p4_s16_conv2d_11cn_relu; - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s16_conv2d_11cn_prelu; - break; - } - } - } else { - // if (args.bias_element) { - // switch (args.activation_type) - // { - // case Linear: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_11cn_bias; - // break; - // case ReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_11cn_bias_relu; - // break; - // case LeakyReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_11cn_bias_leakyrelu; - // break; - // case PReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_11cn_bias_prelu; - // break; - // } - // } else { - // switch (args.activation_type) - // { - // case Linear: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_11cn; - // break; - // case ReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_11cn_relu; - // break; - // case LeakyReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_11cn_leakyrelu; - // break; - // case PReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_11cn_prelu; - // break; - // } - // } - } - i_impl_func = i_impl_func_sp; - return; - -#elif CONFIG_TIE728_BOOST - if (args.output_channel % 8 == 0 && args.input_channel % 8 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_conv2d_11cn_bias; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_11cn_bias_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_11cn_bias_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_11cn_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_conv2d_11cn; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_11cn_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_11cn_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_11cn_prelu; - break; - } - } - } else { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_11cn_bias; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_11cn_bias_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_11cn_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_11cn; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_11cn_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_11cn_leakyrelu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_11cn_prelu; - break; - } - } - } - i_impl_func = i_impl_func_sp; - return; - -#elif CONFIG_XTENSA_BOOST - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_xtensa_s16_conv2d_11cn_bias; - break; - case ReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_11cn_bias_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_11cn_bias; - n_wise_func = output_leakyrelu; - break; - case PReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_11cn_bias; - n_wise_func = output_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_xtensa_s16_conv2d_11cn; - break; - case ReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_11cn_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_11cn; - n_wise_func = output_leakyrelu; - break; - case PReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_11cn; - n_wise_func = output_prelu; - break; - } - } - i_impl_func = i_impl_func_sp; - return; - -#else // C/C++ implementation - c_impl_func_sp = conv2d_11cn; - c_impl_func = c_impl_func_sp; - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_bias_linear; - break; - case ReLU: - n_wise_func = buffer_bias_relu; - break; - case LeakyReLU: - n_wise_func = buffer_bias_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_0000_linear; - break; - case ReLU: - n_wise_func = buffer_0000_relu; - break; - case LeakyReLU: - n_wise_func = buffer_0000_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_0000_prelu; - break; - } - } - return; -#endif -} - -inline void load_conv2d_33cn_s16(i_impl_func_s16_t &i_impl_func, - i_impl_func_s16_t &i_impl_func_sp, - c_impl_func_s16_t &c_impl_func, - c_impl_func_s16_t &c_impl_func_sp, - n_wise_func_s16_t &n_wise_func, - const ArgsType &args) -{ -#if CONFIG_ESP32P4_BOOST - if (args.output_channel % 8 == 0 && args.input_channel % 8 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s16_conv2d_33cn_bias; - i_impl_func = dl_esp32p4_s16_conv2d_hwcn_bias; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s16_conv2d_33cn_bias_relu; - i_impl_func = dl_esp32p4_s16_conv2d_hwcn_bias_relu; - break; - case LeakyReLU: - // i_impl_func_sp = dl_esp32p4_s16_conv2d_33cn_bias_relu; - // i_impl_func = dl_esp32p4_s16_conv2d_hwcn_bias_relu; - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s16_conv2d_33cn_bias_prelu; - // i_impl_func = dl_esp32p4_s16_conv2d_hwcn_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s16_conv2d_33cn; - i_impl_func = dl_esp32p4_s16_conv2d_hwcn; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s16_conv2d_33cn_relu; - i_impl_func = dl_esp32p4_s16_conv2d_hwcn_relu; - break; - case LeakyReLU: - // i_impl_func_sp = dl_esp32p4_s16_conv2d_33cn_relu; - // i_impl_func = dl_esp32p4_s16_conv2d_hwcn_relu; - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s16_conv2d_33cn_prelu; - // i_impl_func = dl_esp32p4_s16_conv2d_hwcn_prelu; - break; - } - } - } else { - // if (args.bias_element) - // { - // switch (args.activation_type) - // { - // case Linear: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_33cn_bias; - // i_impl_func = dl_esp32p4_s16_unaligned_conv2d_hwcn_bias; - // break; - // case ReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_33cn_bias_relu; - // i_impl_func = dl_esp32p4_s16_unaligned_conv2d_hwcn_bias_relu; - // break; - // case LeakyReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_33cn_bias_leakyrelu; - // i_impl_func = dl_esp32p4_s16_unaligned_conv2d_hwcn_bias_leakyrelu; - // break; - // case PReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_33cn_bias_prelu; - // i_impl_func = dl_esp32p4_s16_unaligned_conv2d_hwcn_bias_prelu; - // break; - // } - // } - // else - // { - // switch (args.activation_type) - // { - // case Linear: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_33cn; - // i_impl_func = dl_esp32p4_s16_unaligned_conv2d_hwcn; - // break; - // case ReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_33cn_relu; - // i_impl_func = dl_esp32p4_s16_unaligned_conv2d_hwcn_relu; - // break; - // case LeakyReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_33cn_leakyrelu; - // i_impl_func = dl_esp32p4_s16_unaligned_conv2d_hwcn_leakyrelu; - // break; - // case PReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_33cn_prelu; - // i_impl_func = dl_esp32p4_s16_unaligned_conv2d_hwcn_prelu; - // break; - // } - // } - } - return; - -#elif CONFIG_TIE728_BOOST - if (args.output_channel % 8 == 0 && args.input_channel % 8 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_conv2d_33cn_bias; - i_impl_func = dl_tie728_s16_conv2d_hwcn_bias; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_33cn_bias_relu; - i_impl_func = dl_tie728_s16_conv2d_hwcn_bias_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_33cn_bias_relu; - i_impl_func = dl_tie728_s16_conv2d_hwcn_bias_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_33cn_bias_prelu; - i_impl_func = dl_tie728_s16_conv2d_hwcn_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_conv2d_33cn; - i_impl_func = dl_tie728_s16_conv2d_hwcn; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_33cn_relu; - i_impl_func = dl_tie728_s16_conv2d_hwcn_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_33cn_relu; - i_impl_func = dl_tie728_s16_conv2d_hwcn_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_33cn_prelu; - i_impl_func = dl_tie728_s16_conv2d_hwcn_prelu; - break; - } - } - } else { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_33cn_bias; - i_impl_func = dl_tie728_s16_unaligned_conv2d_hwcn_bias; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_33cn_bias_relu; - i_impl_func = dl_tie728_s16_unaligned_conv2d_hwcn_bias_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu; - i_impl_func = dl_tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_33cn_bias_prelu; - i_impl_func = dl_tie728_s16_unaligned_conv2d_hwcn_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_33cn; - i_impl_func = dl_tie728_s16_unaligned_conv2d_hwcn; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_33cn_relu; - i_impl_func = dl_tie728_s16_unaligned_conv2d_hwcn_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_33cn_leakyrelu; - i_impl_func = dl_tie728_s16_unaligned_conv2d_hwcn_leakyrelu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_33cn_prelu; - i_impl_func = dl_tie728_s16_unaligned_conv2d_hwcn_prelu; - break; - } - } - } - return; - -#elif CONFIG_XTENSA_BOOST - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_xtensa_s16_conv2d_33cn_bias; - i_impl_func = dl_xtensa_s16_conv2d_hwcn_bias; - break; - case ReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_33cn_bias_relu; - i_impl_func = dl_xtensa_s16_conv2d_hwcn_bias_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_33cn_bias; - i_impl_func = dl_xtensa_s16_conv2d_hwcn_bias; - n_wise_func = output_leakyrelu; - break; - case PReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_33cn_bias; - i_impl_func = dl_xtensa_s16_conv2d_hwcn_bias; - n_wise_func = output_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_xtensa_s16_conv2d_33cn; - i_impl_func = dl_xtensa_s16_conv2d_hwcn; - break; - case ReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_33cn_relu; - i_impl_func = dl_xtensa_s16_conv2d_hwcn_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_33cn; - i_impl_func = dl_xtensa_s16_conv2d_hwcn; - n_wise_func = output_leakyrelu; - break; - case PReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_33cn; - i_impl_func = dl_xtensa_s16_conv2d_hwcn; - n_wise_func = output_prelu; - break; - } - } - return; - -#else // C/C++ implementation - c_impl_func_sp = conv2d_33cn; - c_impl_func = conv2d_hwcn; - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_bias_linear; - break; - case ReLU: - n_wise_func = buffer_bias_relu; - break; - case LeakyReLU: - n_wise_func = buffer_bias_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_0000_linear; - break; - case ReLU: - n_wise_func = buffer_0000_relu; - break; - case LeakyReLU: - n_wise_func = buffer_0000_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_0000_prelu; - break; - } - } - return; -#endif -} - -inline void load_conv2d_hwcn_s16(i_impl_func_s16_t &i_impl_func, - i_impl_func_s16_t &i_impl_func_sp, - c_impl_func_s16_t &c_impl_func, - c_impl_func_s16_t &c_impl_func_sp, - n_wise_func_s16_t &n_wise_func, - const ArgsType &args) -{ -#if CONFIG_ESP32P4_BOOST - if (args.output_channel % 8 == 0 && args.input_channel % 8 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s16_conv2d_hwcn_bias; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s16_conv2d_hwcn_bias_relu; - break; - case LeakyReLU: - // i_impl_func_sp = dl_esp32p4_s16_conv2d_hwcn_bias_relu; - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s16_conv2d_hwcn_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s16_conv2d_hwcn; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s16_conv2d_hwcn_relu; - break; - case LeakyReLU: - // i_impl_func_sp = dl_esp32p4_s16_conv2d_hwcn_relu; - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s16_conv2d_hwcn_prelu; - break; - } - } - } else { - // if (args.bias_element) - // { - // switch (args.activation_type) - // { - // case Linear: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_hwcn_bias; - // break; - // case ReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_hwcn_bias_relu; - // break; - // case LeakyReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_hwcn_bias_leakyrelu; - // break; - // case PReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_hwcn_bias_prelu; - // break; - // } - // } - // else - // { - // switch (args.activation_type) - // { - // case Linear: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_hwcn; - // break; - // case ReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_hwcn_relu; - // break; - // case LeakyReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_hwcn_leakyrelu; - // break; - // case PReLU: - // i_impl_func_sp = dl_esp32p4_s16_unaligned_conv2d_hwcn_prelu; - // break; - // } - // } - } - i_impl_func = i_impl_func_sp; - return; - -#elif CONFIG_TIE728_BOOST - if (args.output_channel % 8 == 0 && args.input_channel % 8 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_conv2d_hwcn_bias; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_hwcn_bias_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_hwcn_bias_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_hwcn_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_conv2d_hwcn; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_hwcn_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_hwcn_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_conv2d_hwcn_prelu; - break; - } - } - } else { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_hwcn_bias; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_hwcn_bias_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_hwcn_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_hwcn; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_hwcn_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_hwcn_leakyrelu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_conv2d_hwcn_prelu; - break; - } - } - } - i_impl_func = i_impl_func_sp; - return; - -#elif CONFIG_XTENSA_BOOST - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_xtensa_s16_conv2d_hwcn_bias; - break; - case ReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_hwcn_bias_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_hwcn_bias; - n_wise_func = output_leakyrelu; - break; - case PReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_hwcn_bias; - n_wise_func = output_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_xtensa_s16_conv2d_hwcn; - break; - case ReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_hwcn_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_hwcn; - n_wise_func = output_leakyrelu; - break; - case PReLU: - i_impl_func_sp = dl_xtensa_s16_conv2d_hwcn; - n_wise_func = output_prelu; - break; - } - } - i_impl_func = i_impl_func_sp; - return; - -#else // C/C++ implement - c_impl_func_sp = conv2d_hwcn; - c_impl_func = c_impl_func_sp; - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_bias_linear; - break; - case ReLU: - n_wise_func = buffer_bias_relu; - break; - case LeakyReLU: - n_wise_func = buffer_bias_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_0000_linear; - break; - case ReLU: - n_wise_func = buffer_0000_relu; - break; - case LeakyReLU: - n_wise_func = buffer_0000_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_0000_prelu; - break; - } - } - return; -#endif -} - -template <> -void conv2d(void *args_ptr) -{ - ArgsType &args = *((ArgsType *)args_ptr); - - i_impl_func_s16_t i_impl_func = NULL; - i_impl_func_s16_t i_impl_func_sp = NULL; - c_impl_func_s16_t c_impl_func = NULL; - c_impl_func_s16_t c_impl_func_sp = NULL; - n_wise_func_s16_t n_wise_func = NULL; - - if (args.filter_height == 1 && args.filter_width == 1) // Filter shape = [1, 1, C, N] - { - load_conv2d_11cn_s16(i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func, args); - } else if (args.filter_height == 3 && args.filter_width == 3) // Filter shape = [3, 3, C, N] - { - load_conv2d_33cn_s16(i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func, args); - } else // Filter shape = [H, W, C, N] - { - load_conv2d_hwcn_s16(i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func, args); - } - conv_operation_shell( - args, i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func); -} - -template <> -void conv2d(void *args_ptr) -{ - ArgsType &args = *((ArgsType *)args_ptr); - - i_impl_func_s16_t i_impl_func = NULL; - i_impl_func_s16_t i_impl_func_sp = NULL; - c_impl_func_s16_t c_impl_func = NULL; - c_impl_func_s16_t c_impl_func_sp = NULL; - n_wise_func_s16_t n_wise_func = NULL; - -#if CONFIG_ESP32P4_BOOST - dl_esp32p4_cfg_round(ROUND_MODE_HALF_EVEN); -#endif - - if (args.filter_height == 1 && args.filter_width == 1) // Filter shape = [1, 1, C, N] - { - load_conv2d_11cn_s16(i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func, args); - } else if (args.filter_height == 3 && args.filter_width == 3) // Filter shape = [3, 3, C, N] - { - load_conv2d_33cn_s16(i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func, args); - } else // Filter shape = [H, W, C, N] - { - load_conv2d_hwcn_s16(i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func, args); - } - - conv_operation_shell(args, i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize conv2d -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -inline void load_conv2d_11cn_s8(i_impl_func_s8_t &i_impl_func, - i_impl_func_s8_t &i_impl_func_sp, - const ArgsType &args) -{ -#if CONFIG_ESP32P4_BOOST - if (args.output_channel % 16 == 0 && args.input_channel % 16 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_conv2d_11cn_bias; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_conv2d_11cn_bias_relu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_conv2d_11cn_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_conv2d_11cn; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_conv2d_11cn_relu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_conv2d_11cn_prelu; - break; - } - } - } else { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_11cn_bias; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_11cn_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_11cn; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_11cn_leakyrelu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_11cn_prelu; - break; - } - } - } - i_impl_func = i_impl_func_sp; - return; -#elif CONFIG_TIE728_BOOST - if (args.output_channel % 16 == 0 && args.input_channel % 16 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s8_conv2d_11cn; - break; - case ReLU: - case LeakyReLU: - i_impl_func_sp = dl_tie728_s8_conv2d_11cn_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s8_conv2d_11cn_prelu; - break; - } - } else { - i_impl_func_sp = dl_tie728_s8_unaligned_conv2d_11cn; - } - i_impl_func = i_impl_func_sp; - return; -#endif -} - -inline void load_conv2d_33cn_s8(i_impl_func_s8_t &i_impl_func, - i_impl_func_s8_t &i_impl_func_sp, - const ArgsType &args) -{ -#if CONFIG_ESP32P4_BOOST - if (args.output_channel % 16 == 0 && args.input_channel % 16 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_conv2d_33cn_bias; - i_impl_func = dl_esp32p4_s8_conv2d_hwcn_bias; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_conv2d_33cn_bias_relu; - i_impl_func = dl_esp32p4_s8_conv2d_hwcn_bias_relu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_conv2d_33cn_bias_prelu; - // i_impl_func = dl_esp32p4_s8_conv2d_hwcn_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_conv2d_33cn; - i_impl_func = dl_esp32p4_s8_conv2d_hwcn; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_conv2d_33cn_relu; - i_impl_func = dl_esp32p4_s8_conv2d_hwcn_relu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_conv2d_33cn_prelu; - // i_impl_func = dl_esp32p4_s8_conv2d_hwcn_prelu; - break; - } - } - } else { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_33cn_bias; - i_impl_func = dl_esp32p4_s8_unaligned_conv2d_hwcn_bias; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu; - i_impl_func = dl_esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_33cn_bias_prelu; - // i_impl_func = dl_esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_33cn; - i_impl_func = dl_esp32p4_s8_unaligned_conv2d_hwcn; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_33cn_leakyrelu; - i_impl_func = dl_esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_33cn_prelu; - // i_impl_func = dl_esp32p4_s8_unaligned_conv2d_hwcn_prelu; - break; - } - } - } - return; -#elif CONFIG_TIE728_BOOST - if (args.output_channel % 16 == 0 && args.input_channel % 16 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s8_conv2d_33cn; - i_impl_func = dl_tie728_s8_conv2d_hwcn; - break; - case ReLU: - case LeakyReLU: - i_impl_func_sp = dl_tie728_s8_conv2d_33cn_relu; - i_impl_func = dl_tie728_s8_conv2d_hwcn_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s8_conv2d_33cn_prelu; - i_impl_func = dl_tie728_s8_conv2d_hwcn_prelu; - break; - } - } else { - i_impl_func_sp = dl_tie728_s8_unaligned_conv2d_33cn; - i_impl_func = dl_tie728_s8_unaligned_conv2d_hwcn; - } - return; -#endif -} - -inline void load_conv2d_hwcn_s8(i_impl_func_s8_t &i_impl_func, - i_impl_func_s8_t &i_impl_func_sp, - const ArgsType &args) -{ -#if CONFIG_ESP32P4_BOOST - if (args.output_channel % 16 == 0 && args.input_channel % 16 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_conv2d_hwcn_bias; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_conv2d_hwcn_bias_relu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_conv2d_hwcn_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_conv2d_hwcn; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_conv2d_hwcn_relu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_conv2d_hwcn_prelu; - break; - } - } - } else { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_hwcn_bias; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_hwcn; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_unaligned_conv2d_hwcn_prelu; - break; - } - } - } - i_impl_func = i_impl_func_sp; - return; -#elif CONFIG_TIE728_BOOST - if (args.output_channel % 16 == 0 && args.input_channel % 16 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s8_conv2d_hwcn; - break; - case ReLU: - case LeakyReLU: - i_impl_func_sp = dl_tie728_s8_conv2d_hwcn_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s8_conv2d_hwcn_prelu; - break; - } - } else { - i_impl_func_sp = dl_tie728_s8_unaligned_conv2d_hwcn; - } - i_impl_func = i_impl_func_sp; - return; -#endif -} - -inline void load_conv2d_s8_per_tensor_c_func(c_impl_func_s8_t &c_impl_func, - c_impl_func_s8_t &c_impl_func_sp, - n_wise_func_s8_t &n_wise_func, - const ArgsType &args) -{ - if (args.filter_height == 1 && args.filter_width == 1) // Filter shape = [1, 1, C, N] - { - c_impl_func_sp = conv2d_11cn; - c_impl_func = c_impl_func_sp; - } else if (args.filter_height == 3 && args.filter_width == 3) // Filter shape = [3, 3, C, N] - { - c_impl_func_sp = conv2d_33cn; - c_impl_func = conv2d_hwcn; - } else // Filter shape = [H, W, C, N] - { - c_impl_func_sp = conv2d_hwcn; - c_impl_func = c_impl_func_sp; - } - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_bias_linear; - break; - case ReLU: - n_wise_func = buffer_bias_relu; - break; - case LeakyReLU: - n_wise_func = buffer_bias_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_0000_linear; - break; - case ReLU: - n_wise_func = buffer_0000_relu; - break; - case LeakyReLU: - n_wise_func = buffer_0000_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_0000_prelu; - break; - } - } -} - -inline void load_conv2d_s8_per_channel_c_func(c_impl_func_s8_t &c_impl_func, - c_impl_func_s8_t &c_impl_func_sp, - n_wise_func_s8_t &n_wise_func, - const ArgsType &args) -{ - if (args.filter_height == 1 && args.filter_width == 1) // Filter shape = [1, 1, C, N] - { - c_impl_func_sp = conv2d_11cn; - c_impl_func = c_impl_func_sp; - } else if (args.filter_height == 3 && args.filter_width == 3) // Filter shape = [3, 3, C, N] - { - c_impl_func_sp = conv2d_33cn; - c_impl_func = conv2d_hwcn; - } else // Filter shape = [H, W, C, N] - { - c_impl_func_sp = conv2d_hwcn; - c_impl_func = c_impl_func_sp; - } - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_bias_linear; - break; - case ReLU: - n_wise_func = buffer_bias_relu; - break; - case LeakyReLU: - n_wise_func = buffer_bias_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_0000_linear; - break; - case ReLU: - n_wise_func = buffer_0000_relu; - break; - case LeakyReLU: - n_wise_func = buffer_0000_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_0000_prelu; - break; - } - } -} - -template <> -void conv2d(void *args_ptr) -{ - ArgsType &args = *((ArgsType *)args_ptr); - - i_impl_func_s8_t i_impl_func = NULL; - i_impl_func_s8_t i_impl_func_sp = NULL; - c_impl_func_s8_t c_impl_func = NULL; - c_impl_func_s8_t c_impl_func_sp = NULL; - n_wise_func_s8_t n_wise_func = NULL; - -#if CONFIG_TIE728_BOOST - if (args.filter_height == 1 && args.filter_width == 1) // Filter shape = [1, 1, C, N] - { - load_conv2d_11cn_s8(i_impl_func, i_impl_func_sp, args); - } else if (args.filter_height == 3 && args.filter_width == 3) // Filter shape = [3, 3, C, N] - { - load_conv2d_33cn_s8(i_impl_func, i_impl_func_sp, args); - } else // Filter shape = [H, W, C, N] - { - load_conv2d_hwcn_s8(i_impl_func, i_impl_func_sp, args); - } -#else - load_conv2d_s8_per_tensor_c_func(c_impl_func, c_impl_func_sp, n_wise_func, args); -#endif - conv_operation_shell(args, i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func); -} - -template <> -void conv2d(void *args_ptr) -{ - ArgsType &args = *((ArgsType *)args_ptr); - - i_impl_func_s8_t i_impl_func = NULL; - i_impl_func_s8_t i_impl_func_sp = NULL; - c_impl_func_s8_t c_impl_func = NULL; - c_impl_func_s8_t c_impl_func_sp = NULL; - n_wise_func_s8_t n_wise_func = NULL; - -#if CONFIG_TIE728_BOOST - if (args.filter_height == 1 && args.filter_width == 1) // Filter shape = [1, 1, C, N] - { - load_conv2d_11cn_s8(i_impl_func, i_impl_func_sp, args); - } else if (args.filter_height == 3 && args.filter_width == 3) // Filter shape = [3, 3, C, N] - { - load_conv2d_33cn_s8(i_impl_func, i_impl_func_sp, args); - } else // Filter shape = [H, W, C, N] - { - load_conv2d_hwcn_s8(i_impl_func, i_impl_func_sp, args); - } -#else - load_conv2d_s8_per_channel_c_func(c_impl_func, c_impl_func_sp, n_wise_func, args); -#endif - conv_operation_shell(args, i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func); -} - -template <> -void conv2d(void *args_ptr) -{ - ArgsType &args = *((ArgsType *)args_ptr); - - i_impl_func_s8_t i_impl_func = NULL; - i_impl_func_s8_t i_impl_func_sp = NULL; - c_impl_func_s8_t c_impl_func = NULL; - c_impl_func_s8_t c_impl_func_sp = NULL; - n_wise_func_s8_t n_wise_func = NULL; - -#if CONFIG_ESP32P4_BOOST - dl_esp32p4_cfg_round(ROUND_MODE_HALF_EVEN); -#endif - - if (args.filter_height == 1 && args.filter_width == 1) { - load_conv2d_11cn_s8(i_impl_func, i_impl_func_sp, args); // Filter shape = [1, 1, C, N] - } else if (args.filter_height == 3 && args.filter_width == 3) { - load_conv2d_33cn_s8(i_impl_func, i_impl_func_sp, args); // Filter shape = [3, 3, C, N] - } else { - load_conv2d_hwcn_s8(i_impl_func, i_impl_func_sp, args); // Filter shape = [H, W, C, N] - } - - if (!i_impl_func || !i_impl_func_sp) { - load_conv2d_s8_per_channel_c_func(c_impl_func, c_impl_func_sp, n_wise_func, args); - } - - conv_operation_shell(args, i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func); -} -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_conv2d.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_conv2d.hpp deleted file mode 100644 index 6b1b1355..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_conv2d.hpp +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#include "dl_base.hpp" - -namespace dl { -namespace base { -/** - * @brief conv2d - * - * @tparam feature_t - * @tparam bias_t - * @tparam buffer_t - * @param args_ptr - */ -template -void conv2d(void *const args_ptr); -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_depthwise_conv2d.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_depthwise_conv2d.cpp deleted file mode 100644 index 746b24dd..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_depthwise_conv2d.cpp +++ /dev/null @@ -1,714 +0,0 @@ -#include "dl_base_depthwise_conv2d.hpp" - -#include "dl_base_activate_buffer.hpp" -#include "dl_base_activate_output.hpp" -#include "dl_base_isa.hpp" - -namespace dl { -namespace base { -template -inline void depthwise_conv2d_11c1(buffer_t *buffer_ptr, feature_t *input_ptr, const ArgsType &args) -{ - const feature_t *filter_element = (feature_t *)args.filter_element; - for (size_t input_c = 0; input_c < args.input_channel; input_c++) { - buffer_ptr[input_c] = input_ptr[input_c] * (*filter_element); - filter_element++; - } -} - -template -inline void depthwise_conv2d_33c1(buffer_t *buffer_ptr, feature_t *input_ptr, const ArgsType &args) -{ - // static int flag = 1; - const feature_t *filter_r0 = (feature_t *)args.filter_element; - const feature_t *filter_r1 = filter_r0 + args.filter_y_offset_c; - const feature_t *filter_r2 = filter_r1 + args.filter_y_offset_c; - feature_t *input_row_0 = input_ptr; - - for (size_t filter_x = 0; filter_x < 3; filter_x++) // W - { // - feature_t *input_row_1 = input_row_0 + args.input_dilation_y_offset; // - feature_t *input_row_2 = input_row_1 + args.input_dilation_y_offset; // - for (size_t input_c = 0; input_c < args.input_channel; input_c++) // C - { // - buffer_ptr[input_c] += input_row_0[input_c] * filter_r0[input_c]; - buffer_ptr[input_c] += input_row_1[input_c] * filter_r1[input_c]; - buffer_ptr[input_c] += input_row_2[input_c] * filter_r2[input_c]; - } - filter_r0 += args.input_channel; - filter_r1 += args.input_channel; - filter_r2 += args.input_channel; - input_row_0 += args.input_dilation_x_offset; - } - // flag = 0; -} - -template -inline void depthwise_conv2d_hwc1(buffer_t *buffer_ptr, feature_t *input_ptr, const ArgsType &args) -{ - const feature_t *filter_element = (feature_t *)args.filter_element; - for (size_t filter_y = 0; filter_y < args.filter_height; filter_y++) // H - { // - feature_t *input_yx = input_ptr; // - for (size_t filter_x = 0; filter_x < args.filter_width; filter_x++) // W - { // - for (size_t input_c = 0; input_c < args.input_channel; input_c++) // C - { // - buffer_ptr[input_c] += input_yx[input_c] * (*filter_element); - filter_element++; - } - input_yx += args.input_dilation_x_offset; - } - filter_element += args.filter_y_offset; - input_ptr += args.input_dilation_y_offset; - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize depthwise_conv2d -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -inline void load_depthwise_conv2d_33c1_s16(i_impl_func_s16_t &i_impl_func, - i_impl_func_s16_t &i_impl_func_sp, - c_impl_func_s16_t &c_impl_func, - c_impl_func_s16_t &c_impl_func_sp, - n_wise_func_s16_t &n_wise_func, - const ArgsType &args) -{ -#if CONFIG_TIE728_BOOST - if (args.output_channel % 8 == 0 && args.input_channel % 8 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_depthwise_conv2d_33c1_bias; - i_impl_func = dl_tie728_s16_depthwise_conv2d_hwc1_bias; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_depthwise_conv2d_33c1_bias_relu; - i_impl_func = dl_tie728_s16_depthwise_conv2d_hwc1_bias_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_depthwise_conv2d_33c1_bias_relu; - i_impl_func = dl_tie728_s16_depthwise_conv2d_hwc1_bias_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_depthwise_conv2d_33c1_bias_prelu; - i_impl_func = dl_tie728_s16_depthwise_conv2d_hwc1_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_depthwise_conv2d_33c1; - i_impl_func = dl_tie728_s16_depthwise_conv2d_hwc1; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_depthwise_conv2d_33c1_relu; - i_impl_func = dl_tie728_s16_depthwise_conv2d_hwc1_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_depthwise_conv2d_33c1_relu; - i_impl_func = dl_tie728_s16_depthwise_conv2d_hwc1_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_depthwise_conv2d_33c1_prelu; - i_impl_func = dl_tie728_s16_depthwise_conv2d_hwc1_prelu; - break; - } - } - } else { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_unaligned_depthwise_conv2d_33c1_bias; - i_impl_func = dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu; - i_impl_func = dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu; - i_impl_func = dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu; - i_impl_func = dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_unaligned_depthwise_conv2d_33c1; - i_impl_func = dl_tie728_s16_unaligned_depthwise_conv2d_hwc1; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_depthwise_conv2d_33c1_relu; - i_impl_func = dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_depthwise_conv2d_33c1_relu; - i_impl_func = dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_depthwise_conv2d_33c1_prelu; - i_impl_func = dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu; - break; - } - } - } -#else // C/C++ implementation - c_impl_func_sp = depthwise_conv2d_33c1; - c_impl_func = depthwise_conv2d_hwc1; - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_bias_linear; - break; - case ReLU: - n_wise_func = buffer_bias_relu; - break; - case LeakyReLU: - n_wise_func = buffer_bias_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_0000_linear; - break; - case ReLU: - n_wise_func = buffer_0000_relu; - break; - case LeakyReLU: - n_wise_func = buffer_0000_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_0000_prelu; - break; - } - } -#endif -} - -inline void load_depthwise_conv2d_hwc1_s16(i_impl_func_s16_t &i_impl_func, - i_impl_func_s16_t &i_impl_func_sp, - c_impl_func_s16_t &c_impl_func, - c_impl_func_s16_t &c_impl_func_sp, - n_wise_func_s16_t &n_wise_func, - const ArgsType &args) -{ -#if CONFIG_TIE728_BOOST - if (args.output_channel % 8 == 0 && args.input_channel % 8 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_depthwise_conv2d_hwc1_bias; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_depthwise_conv2d_hwc1_bias_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_depthwise_conv2d_hwc1_bias_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_depthwise_conv2d_hwc1_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_depthwise_conv2d_hwc1; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_depthwise_conv2d_hwc1_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_depthwise_conv2d_hwc1_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_depthwise_conv2d_hwc1_prelu; - break; - } - } - } else { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s16_unaligned_depthwise_conv2d_hwc1; - break; - case ReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_relu; - break; - case LeakyReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu; - break; - } - } - } - - i_impl_func = i_impl_func_sp; -#else // C/C++ implementation - c_impl_func_sp = depthwise_conv2d_hwc1; - c_impl_func = c_impl_func_sp; - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_bias_linear; - break; - case ReLU: - n_wise_func = buffer_bias_relu; - break; - case LeakyReLU: - n_wise_func = buffer_bias_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_0000_linear; - break; - case ReLU: - n_wise_func = buffer_0000_relu; - break; - case LeakyReLU: - n_wise_func = buffer_0000_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_0000_prelu; - break; - } - } -#endif -} - -template <> -void depthwise_conv2d(void *args_ptr) -{ - ArgsType &args = *((ArgsType *)args_ptr); - - i_impl_func_s16_t i_impl_func = NULL; - i_impl_func_s16_t i_impl_func_sp = NULL; - c_impl_func_s16_t c_impl_func = NULL; - c_impl_func_s16_t c_impl_func_sp = NULL; - n_wise_func_s16_t n_wise_func = NULL; - - if (args.filter_height == 3 && args.filter_width == 3) // Filter shape = [3, 3, C, N] - { - load_depthwise_conv2d_33c1_s16(i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func, args); - } else // Filter shape = [H, W, C, N] - { - load_depthwise_conv2d_hwc1_s16(i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func, args); - } - dwconv_operation_shell( - args, i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize depthwise_conv2d -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -inline void load_depthwise_conv2d_33c1_s8(i_impl_func_s8_t &i_impl_func, - i_impl_func_s8_t &i_impl_func_sp, - const ArgsType &args) -{ -#if CONFIG_ESP32P4_BOOST - if (args.output_channel % 16 == 0 && args.input_channel % 16 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_depthwise_conv2d_33c1_bias; - i_impl_func = dl_esp32p4_s8_depthwise_conv2d_hwc1_bias; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_depthwise_conv2d_33c1_bias_relu; - i_impl_func = dl_esp32p4_s8_depthwise_conv2d_hwc1_bias_relu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - i_impl_func_sp = dl_esp32p4_s8_depthwise_conv2d_33c1_bias_prelu; - i_impl_func = dl_esp32p4_s8_depthwise_conv2d_hwc1_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_depthwise_conv2d_33c1; - i_impl_func = dl_esp32p4_s8_depthwise_conv2d_hwc1; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_depthwise_conv2d_33c1_relu; - i_impl_func = dl_esp32p4_s8_depthwise_conv2d_hwc1_relu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - i_impl_func_sp = dl_esp32p4_s8_depthwise_conv2d_33c1_prelu; - i_impl_func = dl_esp32p4_s8_depthwise_conv2d_hwc1_prelu; - break; - } - } - } else { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias; - i_impl_func = dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu; - i_impl_func = dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_conv2d_11cn_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1; - i_impl_func = dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu; - i_impl_func = dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_conv2d_11cn_prelu; - break; - } - } - } - return; -#elif CONFIG_TIE728_BOOST - if (args.output_channel % 16 == 0 && args.input_channel % 16 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s8_depthwise_conv2d_33c1; - i_impl_func = dl_tie728_s8_depthwise_conv2d_hwc1; - break; - case ReLU: - case LeakyReLU: - i_impl_func_sp = dl_tie728_s8_depthwise_conv2d_33c1_relu; - i_impl_func = dl_tie728_s8_depthwise_conv2d_hwc1_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s8_depthwise_conv2d_33c1_prelu; - i_impl_func = dl_tie728_s8_depthwise_conv2d_hwc1_prelu; - break; - } - } else { - i_impl_func_sp = dl_tie728_s8_unaligned_depthwise_conv2d_33c1; - i_impl_func = dl_tie728_s8_unaligned_depthwise_conv2d_hwc1; - } - return; -#endif -} - -inline void load_depthwise_conv2d_hwc1_s8(i_impl_func_s8_t &i_impl_func, - i_impl_func_s8_t &i_impl_func_sp, - const ArgsType &args) -{ -#if CONFIG_ESP32P4_BOOST - if (args.output_channel % 16 == 0 && args.input_channel % 16 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_depthwise_conv2d_hwc1_bias; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_depthwise_conv2d_hwc1_bias_relu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_depthwise_conv2d_hwc1_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_depthwise_conv2d_hwc1; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_depthwise_conv2d_hwc1_relu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_depthwise_conv2d_hwc1_prelu; - break; - } - } - } else { - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1; - break; - case ReLU: - i_impl_func_sp = dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu; - break; - case LeakyReLU: - // Don't be supported now. - break; - case PReLU: - // i_impl_func_sp = dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_prelu; - break; - } - } - } - i_impl_func = i_impl_func_sp; - return; -#elif CONFIG_TIE728_BOOST - if (args.output_channel % 16 == 0 && args.input_channel % 16 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - switch (args.activation_type) { - case Linear: - i_impl_func_sp = dl_tie728_s8_depthwise_conv2d_hwc1; - break; - case ReLU: - case LeakyReLU: - i_impl_func_sp = dl_tie728_s8_depthwise_conv2d_hwc1_relu; - break; - case PReLU: - i_impl_func_sp = dl_tie728_s8_depthwise_conv2d_hwc1_prelu; - break; - } - } else { - i_impl_func_sp = dl_tie728_s8_unaligned_depthwise_conv2d_hwc1; - } - i_impl_func = i_impl_func_sp; - return; -#endif -} - -inline void load_depthwise_conv2d_s8_per_tensor_c_func(c_impl_func_s8_t &c_impl_func, - c_impl_func_s8_t &c_impl_func_sp, - n_wise_func_s8_t &n_wise_func, - const ArgsType &args) -{ - if (args.filter_height == 3 && args.filter_width == 3) // Filter shape = [3, 3, C, N] - { - c_impl_func_sp = depthwise_conv2d_33c1; - c_impl_func = depthwise_conv2d_hwc1; - } else // Filter shape = [H, W, C, N] - { - c_impl_func_sp = depthwise_conv2d_hwc1; - c_impl_func = c_impl_func_sp; - } - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_bias_linear; - break; - case ReLU: - n_wise_func = buffer_bias_relu; - break; - case LeakyReLU: - n_wise_func = buffer_bias_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_0000_linear; - break; - case ReLU: - n_wise_func = buffer_0000_relu; - break; - case LeakyReLU: - n_wise_func = buffer_0000_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_0000_prelu; - break; - } - } -} - -inline void load_depthwise_conv2d_s8_per_channel_c_func(c_impl_func_s8_t &c_impl_func, - c_impl_func_s8_t &c_impl_func_sp, - n_wise_func_s8_t &n_wise_func, - const ArgsType &args) -{ - if (args.filter_height == 3 && args.filter_width == 3) // Filter shape = [3, 3, C, N] - { - c_impl_func_sp = depthwise_conv2d_33c1; - c_impl_func = depthwise_conv2d_hwc1; - } else // Filter shape = [H, W, C, N] - { - c_impl_func_sp = depthwise_conv2d_hwc1; - c_impl_func = c_impl_func_sp; - } - if (args.bias_element) { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_bias_linear; - break; - case ReLU: - n_wise_func = buffer_bias_relu; - break; - case LeakyReLU: - n_wise_func = buffer_bias_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_bias_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - n_wise_func = buffer_0000_linear; - break; - case ReLU: - n_wise_func = buffer_0000_relu; - break; - case LeakyReLU: - n_wise_func = buffer_0000_leakyrelu; - break; - case PReLU: - n_wise_func = buffer_0000_prelu; - break; - } - } -} - -template <> -void depthwise_conv2d(void *args_ptr) -{ - ArgsType &args = *((ArgsType *)args_ptr); - - i_impl_func_s8_t i_impl_func = NULL; - i_impl_func_s8_t i_impl_func_sp = NULL; - c_impl_func_s8_t c_impl_func = NULL; - c_impl_func_s8_t c_impl_func_sp = NULL; - n_wise_func_s8_t n_wise_func = NULL; - -#if CONFIG_TIE728_BOOST - if (args.filter_height == 3 && args.filter_width == 3) // Filter shape = [3, 3, C, N] - { - load_depthwise_conv2d_33c1_s8(i_impl_func, i_impl_func_sp, args); - } else // Filter shape = [H, W, C, N] - { - load_depthwise_conv2d_hwc1_s8(i_impl_func, i_impl_func_sp, args); - } -#else - load_depthwise_conv2d_s8_per_tensor_c_func(c_impl_func, c_impl_func_sp, n_wise_func, args); -#endif - dwconv_operation_shell( - args, i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func); -} - -template <> -void depthwise_conv2d(void *args_ptr) -{ - ArgsType &args = *((ArgsType *)args_ptr); - - i_impl_func_s8_t i_impl_func = NULL; - i_impl_func_s8_t i_impl_func_sp = NULL; - c_impl_func_s8_t c_impl_func = NULL; - c_impl_func_s8_t c_impl_func_sp = NULL; - n_wise_func_s8_t n_wise_func = NULL; - -#if CONFIG_TIE728_BOOST - if (args.filter_height == 3 && args.filter_width == 3) // Filter shape = [3, 3, C, N] - { - load_depthwise_conv2d_33c1_s8(i_impl_func, i_impl_func_sp, args); - } else // Filter shape = [H, W, C, N] - { - load_depthwise_conv2d_hwc1_s8(i_impl_func, i_impl_func_sp, args); - } -#else - load_depthwise_conv2d_s8_per_channel_c_func(c_impl_func, c_impl_func_sp, n_wise_func, args); -#endif - dwconv_operation_shell( - args, i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func); -} - -template <> -void depthwise_conv2d(void *args_ptr) -{ - ArgsType &args = *((ArgsType *)args_ptr); - - i_impl_func_s8_t i_impl_func = NULL; - i_impl_func_s8_t i_impl_func_sp = NULL; - c_impl_func_s8_t c_impl_func = NULL; - c_impl_func_s8_t c_impl_func_sp = NULL; - n_wise_func_s8_t n_wise_func = NULL; - -#if CONFIG_ESP32P4_BOOST - dl_esp32p4_cfg_round(ROUND_MODE_HALF_EVEN); -#endif - - if (args.filter_height == 3 && args.filter_width == 3) { - load_depthwise_conv2d_33c1_s8(i_impl_func, i_impl_func_sp, args); // Filter shape = [3, 3, C, N] - } else { - load_depthwise_conv2d_hwc1_s8(i_impl_func, i_impl_func_sp, args); // Filter shape = [H, W, C, N] - } - - if (!i_impl_func || !i_impl_func_sp) { - load_depthwise_conv2d_s8_per_channel_c_func(c_impl_func, c_impl_func_sp, n_wise_func, args); - } - dwconv_operation_shell( - args, i_impl_func, i_impl_func_sp, c_impl_func, c_impl_func_sp, n_wise_func); -} -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_depthwise_conv2d.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_depthwise_conv2d.hpp deleted file mode 100644 index a90abfd6..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_depthwise_conv2d.hpp +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include "dl_base.hpp" - -namespace dl { -namespace base { -/** - * @brief - * NOTE: support [H, W, C, 1] only by now - * NOTE: in tensorflow, when dilation > 1 the stride must be 1. Our api has no such limitation. But we didn't test this - * oppsite situation. https://tensorflow.google.cn/api_docs/python/tf/nn/depthwise_conv2d - * TODO: support [H, W, C, M] take tensorflow for reference - * - * NOTE: support: feature_t == filter_t == bias_t == activate_t == buffer_t == int16_t - * TODO: support: feature_t == filter_t == bias_t == activate_t == buffer_t == int8_t - * - * @tparam feature_t - * @tparam bias_t - * @tparam buffer_t - * @param args_ptr - */ -template -void depthwise_conv2d(void *args_ptr); -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_leakyrelu.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_leakyrelu.cpp deleted file mode 100644 index 17ab0630..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_leakyrelu.cpp +++ /dev/null @@ -1,87 +0,0 @@ -#include "dl_base_leakyrelu.hpp" - -#include "dl_base_isa.hpp" - -namespace dl { -namespace base { -template -void leakyrelu_11c(feature_t *output_ptr, feature_t *input_ptr, const ArgsType &args) -{ - buffer_t temp; - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - output_ptr[output_c] = input_ptr[output_c]; - if (output_ptr[output_c] < 0) { - temp = - DL_RIGHT_SHIFT((buffer_t)output_ptr[output_c] * (buffer_t)args.activation_alpha, args.activation_shift); - tool::truncate(output_ptr[output_c], temp); - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize leakyrelu -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -inline void load_leakyrelu_11cn_s16(i_impl_acti_s16_t &i_impl_func, - c_impl_acti_s16_t &c_impl_func, - const ArgsType &args) -{ -#if CONFIG_TIE728_BOOST - if (args.input_stride_x_offset % 8 == 0 && args.output_x_offset % 8 == 0 && - !((unsigned)&args.input_element[0] & 15) && !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_tie728_s16_relu_11c; - } else { - i_impl_func = dl_tie728_s16_unaligned_relu_11c; - } - -#else - c_impl_func = leakyrelu_11c; -#endif // CONFIG_TIE_BOOST -} - -template <> -void leakyrelu(void *const args_ptr) -{ - const ArgsType &args = *((ArgsType *)args_ptr); - - i_impl_acti_s16_t i_impl_func = NULL; - c_impl_acti_s16_t c_impl_func = NULL; - - load_leakyrelu_11cn_s16(i_impl_func, c_impl_func, args); - - activation_shell(args, i_impl_func, c_impl_func); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize leakyrelu -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -inline void load_leakyrelu_11cn_s8(i_impl_acti_s8_t &i_impl_func, - c_impl_acti_s8_t &c_impl_func, - const ArgsType &args) -{ -#if CONFIG_TIE728_BOOST - if (args.input_stride_x_offset % 16 == 0 && args.output_x_offset % 16 == 0 && - !((unsigned)&args.input_element[0] & 15) && !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_tie728_s8_relu_11c; - } else { - i_impl_func = dl_tie728_s8_unaligned_relu_11c; - } -#else - c_impl_func = leakyrelu_11c; -#endif // CONFIG_TIE_BOOST -} - -template <> -void leakyrelu(void *const args_ptr) -{ - const ArgsType &args = *((ArgsType *)args_ptr); - - i_impl_acti_s8_t i_impl_func = NULL; - c_impl_acti_s8_t c_impl_func = NULL; - - load_leakyrelu_11cn_s8(i_impl_func, c_impl_func, args); - - activation_shell(args, i_impl_func, c_impl_func); -} -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_leakyrelu.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_leakyrelu.hpp deleted file mode 100644 index 73eb444b..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_leakyrelu.hpp +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -#include "dl_base.hpp" - -namespace dl { -namespace base { -/** - * @brief leakyrelu - * - * @tparam feature_t - */ -template -void leakyrelu(void *const args_ptr); -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_max2d.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_max2d.cpp deleted file mode 100644 index 2b58fb7f..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_max2d.cpp +++ /dev/null @@ -1,93 +0,0 @@ -#include "dl_base_max2d.hpp" - -#include "dl_base_activate_output.hpp" -#include "dl_base_isa.hpp" - -namespace dl { -namespace base { -template -inline void max2d_11c(feature_t *output_ptr, - feature_t *input0_ptr, - feature_t *input1_ptr, - const arithArgsType &args) -{ - for (size_t output_c = 0; output_c < args.channel; output_c++) // C - { - output_ptr[output_c] = DL_MAX(input0_ptr[output_c], input1_ptr[output_c]); - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize max2d -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -inline void load_max2d_11c_s16(arith_i_impl_func_s16_t &i_impl_func, - arith_c_impl_func_s16_t &c_impl_func, - arith_n_wise_tail_s16_t &n_wise_tail, - const arithArgsType &args) -{ -#if CONFIG_TIE728_BOOST - if (args.input0_x_offset % 8 == 0 && args.input1_x_offset % 8 == 0 && args.output_x_offset % 8 == 0 && - !((unsigned)&args.input0_element[0] & 15) && !((unsigned)&args.input1_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_tie728_s16_max2d_11c; - } else { - i_impl_func = dl_tie728_s16_unaligned_max2d_11c; - } - -#else - c_impl_func = max2d_11c; -#endif // CONFIG_TIE728_BOOST -} - -template <> -void max2d(void *const args_ptr) -{ - const arithArgsType &args = *((arithArgsType *)args_ptr); - - arith_i_impl_func_s16_t i_impl_func = NULL; - arith_c_impl_func_s16_t c_impl_func = NULL; - arith_n_wise_tail_s16_t n_wise_tail = NULL; - - load_max2d_11c_s16(i_impl_func, c_impl_func, n_wise_tail, args); - - arith_operation_shell(args, i_impl_func, c_impl_func, n_wise_tail); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize max2d -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -inline void load_max2d_11c_s8(arith_i_impl_func_s8_t &i_impl_func, - arith_c_impl_func_s8_t &c_impl_func, - arith_n_wise_tail_s8_t &n_wise_tail, - const arithArgsType &args) -{ -#if CONFIG_TIE728_BOOST - if (args.input0_x_offset % 16 == 0 && args.input1_x_offset % 16 == 0 && args.output_x_offset % 16 == 0 && - !((unsigned)&args.input0_element[0] & 15) && !((unsigned)&args.input1_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_tie728_s8_max2d_11c; - } else { - i_impl_func = dl_tie728_s8_unaligned_max2d_11c; - } -#else - c_impl_func = max2d_11c; -#endif // CONFIG_TIE728_BOOST -} - -template <> -void max2d(void *const args_ptr) -{ - const arithArgsType &args = *((arithArgsType *)args_ptr); - - arith_i_impl_func_s8_t i_impl_func = NULL; - arith_c_impl_func_s8_t c_impl_func = NULL; - arith_n_wise_tail_s8_t n_wise_tail = NULL; - - load_max2d_11c_s8(i_impl_func, c_impl_func, n_wise_tail, args); - - arith_operation_shell(args, i_impl_func, c_impl_func, n_wise_tail); -} -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_max2d.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_max2d.hpp deleted file mode 100644 index 8007834c..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_max2d.hpp +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#include "dl_base.hpp" - -namespace dl { -namespace base { -/** - * @brief max2d - * - * @tparam feature_t - * @param args_ptr - */ -template -void max2d(void *const args_ptr); - -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_max_pool2d.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_max_pool2d.cpp deleted file mode 100644 index 1b56a68d..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_max_pool2d.cpp +++ /dev/null @@ -1,102 +0,0 @@ -#include "dl_base_max_pool2d.hpp" - -#include "dl_base_activate_buffer.hpp" -#include "dl_base_activate_output.hpp" -#include "dl_base_isa.hpp" - -namespace dl { -namespace base { -template -inline void max_pool2d_hwc1(buffer_t *buffer_ptr, - feature_t *input_ptr, - feature_t *output_ptr, - PoolArgsType &args) -{ - for (size_t input_c = 0; input_c < args.input_channel; input_c++) { - buffer_ptr[input_c] = input_ptr[input_c]; - } - - for (size_t filter_y = 0; filter_y < args.filter_height; filter_y++) // H - { // - feature_t *input_yx = input_ptr; - for (size_t filter_x = 0; filter_x < args.filter_width; filter_x++) // W - { // - for (size_t input_c = 0; input_c < args.input_channel; input_c++) // C - { // - buffer_ptr[input_c] = DL_MAX(input_yx[input_c], buffer_ptr[input_c]); - } - input_yx += args.input_x_offset; - } - input_ptr += args.input_y_offset; - } - - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - tool::truncate(output_ptr[output_c], buffer_ptr[output_c]); - buffer_ptr[output_c] = 0; - } -} - -template -void max_pool2d(void *args_ptr) -{ - const PoolArgsType &args = *((PoolArgsType *)args_ptr); - - max_pool_shell(args, NULL, NULL, max_pool2d_hwc1); -} - -template <> -void max_pool2d(void *args_ptr) -{ - PoolArgsType &args = *((PoolArgsType *)args_ptr); - - i_impl_func_s16_t i_impl_func = NULL; - i_impl_func_s16_t i_impl_func_sp = NULL; - pool_c_impl_func_s16_t c_impl_func = NULL; - -#if CONFIG_TIE728_BOOST - if (args.input_x_offset % 8 == 0 && args.output_x_offset % 8 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_tie728_s16_max_pool2d_hwc1; - i_impl_func_sp = (args.filter_height == 2 && args.filter_width == 2) ? dl_tie728_s16_max_pool2d_22c1 - : dl_tie728_s16_max_pool2d_hwc1; - } else { - i_impl_func = dl_tie728_s16_unaligned_max_pool2d_hwc1; - i_impl_func_sp = (args.filter_height == 2 && args.filter_width == 2) ? dl_tie728_s16_unaligned_max_pool2d_22c1 - : dl_tie728_s16_unaligned_max_pool2d_hwc1; - } -#else - c_impl_func = max_pool2d_hwc1; -#endif - - max_pool_shell(args, i_impl_func, i_impl_func_sp, c_impl_func); -} - -template <> -void max_pool2d(void *args_ptr) -{ - PoolArgsType &args = *((PoolArgsType *)args_ptr); - - i_impl_func_s8_t i_impl_func = NULL; - i_impl_func_s8_t i_impl_func_sp = NULL; - pool_c_impl_func_s8_t c_impl_func = NULL; - -#if CONFIG_TIE728_BOOST - if (args.input_x_offset % 16 == 0 && args.output_x_offset % 16 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_tie728_s8_max_pool2d_hwc1; - i_impl_func_sp = (args.filter_height == 2 && args.filter_width == 2) ? dl_tie728_s8_max_pool2d_22c1 - : dl_tie728_s8_max_pool2d_hwc1; - } else { - i_impl_func = dl_tie728_s8_unaligned_max_pool2d_hwc1; - i_impl_func_sp = (args.filter_height == 2 && args.filter_width == 2) ? dl_tie728_s8_unaligned_max_pool2d_22c1 - : dl_tie728_s8_unaligned_max_pool2d_hwc1; - } -#else - c_impl_func = max_pool2d_hwc1; -#endif - - max_pool_shell(args, i_impl_func, i_impl_func_sp, c_impl_func); -} - -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_max_pool2d.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_max_pool2d.hpp deleted file mode 100644 index d90138bd..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_max_pool2d.hpp +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#include "dl_base_pool2d.hpp" - -namespace dl { -namespace base { -/** - * @brief - * - * @tparam feature_t - * @tparam filter_t - * @tparam buffer_t - * @param args_ptr - */ -template -void max_pool2d(void *args_ptr); -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_min2d.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_min2d.cpp deleted file mode 100644 index edcfe76d..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_min2d.cpp +++ /dev/null @@ -1,93 +0,0 @@ -#include "dl_base_min2d.hpp" - -#include "dl_base_activate_output.hpp" -#include "dl_base_isa.hpp" - -namespace dl { -namespace base { -template -inline void min2d_11c(feature_t *output_ptr, - feature_t *input0_ptr, - feature_t *input1_ptr, - const arithArgsType &args) -{ - for (size_t output_c = 0; output_c < args.channel; output_c++) // C - { - output_ptr[output_c] = DL_MIN(input0_ptr[output_c], input1_ptr[output_c]); - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize min2d -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -inline void load_min2d_11c_s16(arith_i_impl_func_s16_t &i_impl_func, - arith_c_impl_func_s16_t &c_impl_func, - arith_n_wise_tail_s16_t &n_wise_tail, - const arithArgsType &args) -{ -#if CONFIG_TIE728_BOOST - if (args.input0_x_offset % 8 == 0 && args.input1_x_offset % 8 == 0 && args.output_x_offset % 8 == 0 && - !((unsigned)&args.input0_element[0] & 15) && !((unsigned)&args.input1_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_tie728_s16_min2d_11c; - } else { - i_impl_func = dl_tie728_s16_unaligned_min2d_11c; - } - -#else - c_impl_func = min2d_11c; -#endif // CONFIG_TIE728_BOOST -} - -template <> -void min2d(void *const args_ptr) -{ - const arithArgsType &args = *((arithArgsType *)args_ptr); - - arith_i_impl_func_s16_t i_impl_func = NULL; - arith_c_impl_func_s16_t c_impl_func = NULL; - arith_n_wise_tail_s16_t n_wise_tail = NULL; - - load_min2d_11c_s16(i_impl_func, c_impl_func, n_wise_tail, args); - - arith_operation_shell(args, i_impl_func, c_impl_func, n_wise_tail); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize min2d -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -inline void load_min2d_11c_s8(arith_i_impl_func_s8_t &i_impl_func, - arith_c_impl_func_s8_t &c_impl_func, - arith_n_wise_tail_s8_t &n_wise_tail, - const arithArgsType &args) -{ -#if CONFIG_TIE728_BOOST - if (args.input0_x_offset % 16 == 0 && args.input1_x_offset % 16 == 0 && args.output_x_offset % 16 == 0 && - !((unsigned)&args.input0_element[0] & 15) && !((unsigned)&args.input1_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_tie728_s8_min2d_11c; - } else { - i_impl_func = dl_tie728_s8_unaligned_min2d_11c; - } -#else - c_impl_func = min2d_11c; -#endif // CONFIG_TIE728_BOOST -} - -template <> -void min2d(void *const args_ptr) -{ - const arithArgsType &args = *((arithArgsType *)args_ptr); - - arith_i_impl_func_s8_t i_impl_func = NULL; - arith_c_impl_func_s8_t c_impl_func = NULL; - arith_n_wise_tail_s8_t n_wise_tail = NULL; - - load_min2d_11c_s8(i_impl_func, c_impl_func, n_wise_tail, args); - - arith_operation_shell(args, i_impl_func, c_impl_func, n_wise_tail); -} -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_min2d.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_min2d.hpp deleted file mode 100644 index 991be92c..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_min2d.hpp +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#include "dl_base.hpp" - -namespace dl { -namespace base { -/** - * @brief min2d - * - * @tparam feature_t - * @param args_ptr - */ -template -void min2d(void *const args_ptr); - -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_mul2d.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_mul2d.cpp deleted file mode 100644 index f1c7d71a..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_mul2d.cpp +++ /dev/null @@ -1,233 +0,0 @@ -#include "dl_base_mul2d.hpp" - -#include "dl_base_activate_output.hpp" -#include "dl_base_isa.hpp" - -namespace dl { -namespace base { -template -inline void mul2d_11c(feature_t *output_ptr, - feature_t *input0_ptr, - feature_t *input1_ptr, - const arithArgsType &args) -{ - buffer_t buffer; - for (size_t output_c = 0; output_c < args.channel; output_c++) // C - { - buffer = (buffer_t)input0_ptr[output_c] * (buffer_t)input1_ptr[output_c]; - buffer = DL_RIGHT_SHIFT(buffer, args.mul_shift); - tool::truncate(output_ptr[output_c], buffer); - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize mul2d -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -inline void load_mul2d_11c_s16(arith_i_impl_func_s16_t &i_impl_func, - arith_c_impl_func_s16_t &c_impl_func, - arith_n_wise_tail_s16_t &n_wise_tail, - const arithArgsType &args) -{ -#if CONFIG_ESP32P4_BOOST - if (args.input0_x_offset % 8 == 0 && args.input1_x_offset % 8 == 0 && args.output_x_offset % 8 == 0 && - !((unsigned)&args.input0_element[0] & 15) && !((unsigned)&args.input1_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - switch (args.activation_type) { - case Linear: - i_impl_func = dl_esp32p4_s16_mul2d_11c; - break; - case ReLU: - case LeakyReLU: - i_impl_func = dl_esp32p4_s16_mul2d_11c_relu; - break; - case PReLU: - i_impl_func = dl_esp32p4_s16_mul2d_11c_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func = dl_esp32p4_s16_unaligned_mul2d_11c; - break; - case ReLU: - case LeakyReLU: - i_impl_func = dl_esp32p4_s16_unaligned_mul2d_11c_relu; - break; - case PReLU: - i_impl_func = dl_esp32p4_s16_unaligned_mul2d_11c_prelu; - break; - } - } -#elif CONFIG_TIE728_BOOST - if (args.input0_x_offset % 8 == 0 && args.input1_x_offset % 8 == 0 && args.output_x_offset % 8 == 0 && - !((unsigned)&args.input0_element[0] & 15) && !((unsigned)&args.input1_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - switch (args.activation_type) { - case Linear: - i_impl_func = dl_tie728_s16_mul2d_11c; - break; - case ReLU: - case LeakyReLU: - i_impl_func = dl_tie728_s16_mul2d_11c_relu; - break; - case PReLU: - i_impl_func = dl_tie728_s16_mul2d_11c_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func = dl_tie728_s16_unaligned_mul2d_11c; - break; - case ReLU: - case LeakyReLU: - i_impl_func = dl_tie728_s16_unaligned_mul2d_11c_relu; - break; - case PReLU: - i_impl_func = dl_tie728_s16_unaligned_mul2d_11c_prelu; - break; - } - } -#else - c_impl_func = mul2d_11c; - - switch (args.activation_type) { - case Linear: - n_wise_tail = NULL; - break; - case ReLU: - n_wise_tail = arith_output_relu; - break; - case LeakyReLU: - n_wise_tail = arith_output_leakyrelu; - break; - case PReLU: - n_wise_tail = arith_output_prelu; - break; - } -#endif // CONFIG_TIE728_BOOST -} - -template <> -void mul2d(void *const args_ptr) -{ - const arithArgsType &args = *((arithArgsType *)args_ptr); - - arith_i_impl_func_s16_t i_impl_func = NULL; - arith_c_impl_func_s16_t c_impl_func = NULL; - arith_n_wise_tail_s16_t n_wise_tail = NULL; - -#if CONFIG_ESP32P4_BOOST - dl_esp32p4_cfg_round(ROUND_MODE_HALF_EVEN); -#endif - load_mul2d_11c_s16(i_impl_func, c_impl_func, n_wise_tail, args); - - arith_operation_shell(args, i_impl_func, c_impl_func, n_wise_tail); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize mul2d -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -inline void load_mul2d_11c_s8(arith_i_impl_func_s8_t &i_impl_func, - arith_c_impl_func_s8_t &c_impl_func, - arith_n_wise_tail_s8_t &n_wise_tail, - const arithArgsType &args) -{ -#if CONFIG_ESP32P4_BOOST - if (args.input0_x_offset % 16 == 0 && args.input1_x_offset % 16 == 0 && args.output_x_offset % 16 == 0 && - !((unsigned)&args.input0_element[0] & 15) && !((unsigned)&args.input1_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - switch (args.activation_type) { - case Linear: - i_impl_func = dl_esp32p4_s8_mul2d_11c; - break; - case ReLU: - case LeakyReLU: - i_impl_func = dl_esp32p4_s8_mul2d_11c_relu; - break; - case PReLU: - i_impl_func = dl_esp32p4_s8_mul2d_11c_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func = dl_esp32p4_s8_unaligned_mul2d_11c; - break; - case ReLU: - case LeakyReLU: - i_impl_func = dl_esp32p4_s8_unaligned_mul2d_11c_relu; - break; - case PReLU: - i_impl_func = dl_esp32p4_s8_unaligned_mul2d_11c_prelu; - break; - } - } -#elif CONFIG_TIE728_BOOST - if (args.input0_x_offset % 16 == 0 && args.input1_x_offset % 16 == 0 && args.output_x_offset % 16 == 0 && - !((unsigned)&args.input0_element[0] & 15) && !((unsigned)&args.input1_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - switch (args.activation_type) { - case Linear: - i_impl_func = dl_tie728_s8_mul2d_11c; - break; - case ReLU: - case LeakyReLU: - i_impl_func = dl_tie728_s8_mul2d_11c_relu; - break; - case PReLU: - i_impl_func = dl_tie728_s8_mul2d_11c_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func = dl_tie728_s8_unaligned_mul2d_11c; - break; - case ReLU: - case LeakyReLU: - i_impl_func = dl_tie728_s8_unaligned_mul2d_11c_relu; - break; - case PReLU: - i_impl_func = dl_tie728_s8_unaligned_mul2d_11c_prelu; - break; - } - } -#else - c_impl_func = mul2d_11c; - - switch (args.activation_type) { - case Linear: - n_wise_tail = NULL; - break; - case ReLU: - n_wise_tail = arith_output_relu; - break; - case LeakyReLU: - n_wise_tail = arith_output_leakyrelu; - break; - case PReLU: - n_wise_tail = arith_output_prelu; - break; - } -#endif // CONFIG_TIE728_BOOST -} - -template <> -void mul2d(void *const args_ptr) -{ - const arithArgsType &args = *((arithArgsType *)args_ptr); - - arith_i_impl_func_s8_t i_impl_func = NULL; - arith_c_impl_func_s8_t c_impl_func = NULL; - arith_n_wise_tail_s8_t n_wise_tail = NULL; - -#if CONFIG_ESP32P4_BOOST - dl_esp32p4_cfg_round(ROUND_MODE_HALF_EVEN); -#endif - load_mul2d_11c_s8(i_impl_func, c_impl_func, n_wise_tail, args); - - arith_operation_shell(args, i_impl_func, c_impl_func, n_wise_tail); -} -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_mul2d.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_mul2d.hpp deleted file mode 100644 index 255009f3..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_mul2d.hpp +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#include "dl_base.hpp" - -namespace dl { -namespace base { -/** - * @brief mul2d - * - * @tparam feature_t - * @param args_ptr - */ -template -void mul2d(void *const args_ptr); - -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_pool2d.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_pool2d.hpp deleted file mode 100644 index 75b3d88d..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_pool2d.hpp +++ /dev/null @@ -1,800 +0,0 @@ -#pragma once - -#include "dl_base.hpp" - -namespace dl { -namespace base { -/** - * @brief - * - * @tparam feature_t - */ -template -struct PoolArgsType { - feature_t *input_element; /*> - */ -template -std::vector> get_pool_args(Tensor &output, - Tensor &input, - std::vector &padding, - std::vector &filter_shape, - const int stride_y, - const int stride_x, - const int core_number = 1) -{ - PoolArgsType args; - // args.input_element = input.get_element_ptr(padding); - args.input_element = input.get_element_ptr(); - args.input_height = input.shape[0]; - args.input_width = input.shape[1]; - args.input_channel = input.shape[2]; - args.input_stride_y_offset = input.shape[1] * input.shape[2] * stride_y; - args.input_stride_x_offset = input.shape[2] * stride_x; - args.input_y_offset = input.shape[1] * input.shape[2]; - args.input_x_offset = input.shape[2]; - args.input_y_offset_bytes = args.input_y_offset * sizeof(feature_t); - args.input_x_offset_bytes = args.input_x_offset * sizeof(feature_t); - args.input_exponent = input.exponent; - - args.output_element = output.get_element_ptr(); - args.output_height = output.shape[0]; - args.output_width = output.shape[1]; - args.output_channel = output.shape[2]; - args.output_y_offset = output.shape[1] * output.shape[2]; - args.output_x_offset = output.shape[2]; - args.output_exponent = output.exponent; - - args.filter_height = filter_shape[0]; - args.filter_width = filter_shape[1]; - args.avg_pool_area = args.filter_height * args.filter_width; - int max_value = INT_MAX; -#if CONFIG_ESP32P4_BOOST - if (sizeof(feature_t) == 1) { - max_value = 127; - } else if (sizeof(feature_t) == 2) { - max_value = 32767; - } -#else - if (sizeof(feature_t) == 1) { - max_value = 64; - } else if (sizeof(feature_t) == 2) { - max_value = 16384; - } -#endif - args.pool_exponent = -tool::calculate_exponent(args.filter_height * args.filter_width, max_value); - args.mac_shift = output.exponent - args.pool_exponent - input.exponent; - - // for ISA - int u = 16 / sizeof(feature_t); - args.c_remainder = (input.shape[2] % u) * sizeof(feature_t); -#if CONFIG_ESP32P4_BOOST - args.avg_pool_area_inv = tool::round(1.f / (args.filter_height * args.filter_width) * (1 << (-args.pool_exponent))); -#else - args.avg_pool_area_inv = (1 << (-args.pool_exponent)) / (args.filter_height * args.filter_width); -#endif - int c_div_x = input.shape[2] / u; - if (args.c_remainder != 0 && args.input_x_offset % u == 0 && args.output_x_offset % u == 0 && - !((unsigned)&args.input_element[0] & 15) && !((unsigned)&args.output_element[0] & 15)) { - c_div_x += 1; - } - args.c_div_x_1 = c_div_x - 1; - - args.padding_h_head = padding[0]; - args.padding_h_tail = padding[1]; - args.padding_w_head = padding[2]; - args.padding_w_tail = padding[3]; - - args.stride_x = stride_x; - args.stride_y = stride_y; - // slice - std::vector> m_args(core_number, args); - if (core_number > 1) { - int output_y_slice = output.shape[0] / core_number; - int output_y_remained = output.shape[0]; - - // first slice - m_args[0].output_height = output_y_slice; - output_y_remained -= output_y_slice; - - // between slice - for (size_t i = 1; i < core_number - 1; i++) { - m_args[i].input_element = - m_args[i - 1].input_element + m_args[i - 1].output_height * args.input_stride_y_offset; - m_args[i].output_element = - m_args[i - 1].output_element + m_args[i - 1].output_height * args.output_y_offset; - m_args[i].output_height = output_y_slice; - output_y_remained -= output_y_slice; - } - - // last slice - m_args.back().input_element = - m_args[core_number - 2].input_element + m_args[core_number - 2].output_height * args.input_stride_y_offset; - m_args.back().output_element = - m_args[core_number - 2].output_element + m_args[core_number - 2].output_height * args.output_y_offset; - m_args.back().output_height = output_y_remained; - } - - return m_args; -} - -template -std::vector> get_pool_args(TensorBase *output, - TensorBase *input, - const std::vector &padding, - const std::vector &filter_shape, - const int stride_y, - const int stride_x, - const runtime_mode_t runtime_mode = RUNTIME_MODE_AUTO) -{ - PoolArgsType args; - // args.input_element = input->get_element_ptr(padding); - args.input_element = (feature_t *)input->get_element_ptr(); - args.input_height = input->shape[1]; - args.input_width = input->shape[2]; - args.input_channel = input->shape[3]; - args.input_stride_y_offset = input->shape[2] * input->shape[3] * stride_y; - args.input_stride_x_offset = input->shape[3] * stride_x; - args.input_y_offset = input->shape[2] * input->shape[3]; - args.input_x_offset = input->shape[3]; - args.input_y_offset_bytes = args.input_y_offset * sizeof(feature_t); - args.input_x_offset_bytes = args.input_x_offset * sizeof(feature_t); - args.input_exponent = input->exponent; - - args.output_element = (feature_t *)output->get_element_ptr(); - args.output_height = output->shape[1]; - args.output_width = output->shape[2]; - args.output_channel = output->shape[3]; - args.output_y_offset = output->shape[2] * output->shape[3]; - args.output_x_offset = output->shape[3]; - args.output_exponent = output->exponent; - - args.filter_height = filter_shape[0]; - args.filter_width = filter_shape[1]; - args.avg_pool_area = args.filter_height * args.filter_width; - int max_value = INT_MAX; -#if CONFIG_ESP32P4_BOOST - if (sizeof(feature_t) == 1) { - max_value = 127; - } else if (sizeof(feature_t) == 2) { - max_value = 32767; - } -#else - if (sizeof(feature_t) == 1) { - max_value = 64; - } else if (sizeof(feature_t) == 2) { - max_value = 16384; - } -#endif - args.pool_exponent = -tool::calculate_exponent(args.filter_height * args.filter_width, max_value); - args.mac_shift = output->exponent - args.pool_exponent - input->exponent; - - // for ISA - int u = 16 / sizeof(feature_t); - args.c_remainder = (input->shape[3] % u) * sizeof(feature_t); -#if CONFIG_ESP32P4_BOOST - args.avg_pool_area_inv = tool::round(1.f / (args.filter_height * args.filter_width) * (1 << (-args.pool_exponent))); -#else - args.avg_pool_area_inv = (1 << (-args.pool_exponent)) / (args.filter_height * args.filter_width); -#endif - int c_div_x = input->shape[3] / u; - if (args.c_remainder != 0 && args.input_x_offset % u == 0 && args.output_x_offset % u == 0 && - !((unsigned)&args.input_element[0] & 15) && !((unsigned)&args.output_element[0] & 15)) { - c_div_x += 1; - } - args.c_div_x_1 = c_div_x - 1; - - args.padding_h_head = padding[0]; - args.padding_h_tail = padding[1]; - args.padding_w_head = padding[2]; - args.padding_w_tail = padding[3]; - - args.stride_x = stride_x; - args.stride_y = stride_y; - // slice - std::vector> m_args(1, args); - if (runtime_mode == RUNTIME_MODE_MULTI_CORE) { - } - - return m_args; -} - -typedef void (*pool_c_impl_func_s16_t)(int64_t *, int16_t *, int16_t *, PoolArgsType &); -typedef void (*pool_c_impl_func_s8_t)(int32_t *, int8_t *, int8_t *, PoolArgsType &); - -typedef void (*avg_pool_c_impl_func_s16_t)(float *, int16_t *, int16_t *, PoolArgsType &); -typedef void (*avg_pool_c_impl_func_s8_t)(float *, int8_t *, int8_t *, PoolArgsType &); - -/** - * @brief - * - * @tparam feature_t - * @tparam buffer_t - * @param args - * @param i_impl_func - * @param i_impl_func_sp - * @param c_impl_func - * @param n_wise_tail - */ -template -void avg_pool_shell(PoolArgsType &args, - void (*i_impl_func)(feature_t *, feature_t *, void *), - void (*i_impl_func_sp)(feature_t *, feature_t *, void *), - void (*c_impl_func)(buffer_t *, feature_t *, feature_t *, PoolArgsType &)) -{ - feature_t *input_ptr_real = (feature_t *)args.input_element; - feature_t *output_ptr = (feature_t *)args.output_element; - int n_h_head = (args.padding_h_head + args.stride_y - 1) / args.stride_y; - int n_w_head = (args.padding_w_head + args.stride_x - 1) / args.stride_x; - int n_h_body = (args.input_height + args.padding_h_head - args.filter_height) / args.stride_y + 1 - n_h_head; - int n_w_body = (args.input_width + args.padding_w_head - args.filter_width) / args.stride_x + 1 - n_w_head; - int n_h_tail = args.output_height - n_h_head - n_h_body; - int n_w_tail = args.output_width - n_w_head - n_w_body; - int filter_h = args.filter_height; - int filter_w = args.filter_width; - int new_pool_exponent = args.pool_exponent; - int max_value = INT_MAX; -#if CONFIG_ESP32P4_BOOST - if (sizeof(feature_t) == 1) { - max_value = 127; - } else if (sizeof(feature_t) == 2) { - max_value = 32767; - } -#else - if (sizeof(feature_t) == 1) { - max_value = 64; - } else if (sizeof(feature_t) == 2) { - max_value = 16384; - } -#endif - -#if !CONFIG_ACCURATE_INFER - if (i_impl_func && i_impl_func_sp) { - feature_t *input_syx_real = input_ptr_real; - feature_t *output_yx = output_ptr; - for (size_t output_y = 0; output_y < n_h_head; output_y++) { - output_yx = output_ptr; - input_syx_real = input_ptr_real; - args.filter_height = filter_h - args.padding_h_head + output_y * args.stride_y; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - args.padding_w_head + output_x * args.stride_x; - new_pool_exponent = -tool::calculate_exponent(args.filter_height * args.filter_width, max_value); - args.mac_shift = args.mac_shift + args.pool_exponent - new_pool_exponent; - args.pool_exponent = new_pool_exponent; -#if CONFIG_ESP32P4_BOOST - args.avg_pool_area_inv = - tool::round(1.f / (args.filter_height * args.filter_width) * (1 << (-args.pool_exponent))); -#else - args.avg_pool_area_inv = (1 << (-args.pool_exponent)) / (args.filter_height * args.filter_width); -#endif - i_impl_func(output_yx, input_syx_real, (void *const)&args); - output_yx += args.output_x_offset; - } - - input_syx_real += (args.stride_x * n_w_head - args.padding_w_head) * args.input_x_offset; - args.filter_width = filter_w; - new_pool_exponent = -tool::calculate_exponent(args.filter_height * args.filter_width, max_value); - args.mac_shift = args.mac_shift + args.pool_exponent - new_pool_exponent; - args.pool_exponent = new_pool_exponent; -#if CONFIG_ESP32P4_BOOST - args.avg_pool_area_inv = - tool::round(1.f / (args.filter_height * args.filter_width) * (1 << (-args.pool_exponent))); -#else - args.avg_pool_area_inv = (1 << (-args.pool_exponent)) / (args.filter_height * args.filter_width); -#endif - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func(output_yx, input_syx_real, (void *const)&args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = filter_w - args.padding_w_tail + (n_w_tail - 1 - output_x) * args.stride_x; - new_pool_exponent = -tool::calculate_exponent(args.filter_height * args.filter_width, max_value); - args.mac_shift = args.mac_shift + args.pool_exponent - new_pool_exponent; - args.pool_exponent = new_pool_exponent; -#if CONFIG_ESP32P4_BOOST - args.avg_pool_area_inv = - tool::round(1.f / (args.filter_height * args.filter_width) * (1 << (-args.pool_exponent))); -#else - args.avg_pool_area_inv = (1 << (-args.pool_exponent)) / (args.filter_height * args.filter_width); -#endif - i_impl_func(output_yx, input_syx_real, (void *const)&args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - output_ptr += args.output_y_offset; - } - - input_ptr_real += (args.stride_y * n_h_head - args.padding_h_head) * args.input_y_offset; - args.filter_height = filter_h; - - for (size_t output_y = 0; output_y < n_h_body; output_y++) { - input_syx_real = input_ptr_real; - output_yx = output_ptr; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - args.padding_w_head + output_x * args.stride_x; - new_pool_exponent = -tool::calculate_exponent(args.filter_height * args.filter_width, max_value); - args.mac_shift = args.mac_shift + args.pool_exponent - new_pool_exponent; - args.pool_exponent = new_pool_exponent; -#if CONFIG_ESP32P4_BOOST - args.avg_pool_area_inv = - tool::round(1.f / (args.filter_height * args.filter_width) * (1 << (-args.pool_exponent))); -#else - args.avg_pool_area_inv = (1 << (-args.pool_exponent)) / (args.filter_height * args.filter_width); -#endif - i_impl_func(output_yx, input_syx_real, (void *const)&args); - output_yx += args.output_x_offset; - } - - input_syx_real += (args.stride_x * n_w_head - args.padding_w_head) * args.input_x_offset; - args.filter_width = filter_w; - new_pool_exponent = -tool::calculate_exponent(args.filter_height * args.filter_width, max_value); - args.mac_shift = args.mac_shift + args.pool_exponent - new_pool_exponent; - args.pool_exponent = new_pool_exponent; -#if CONFIG_ESP32P4_BOOST - args.avg_pool_area_inv = - tool::round(1.f / (args.filter_height * args.filter_width) * (1 << (-args.pool_exponent))); -#else - args.avg_pool_area_inv = (1 << (-args.pool_exponent)) / (args.filter_height * args.filter_width); -#endif - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func_sp(output_yx, input_syx_real, (void *const)&args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = filter_w - args.padding_w_tail + (n_w_tail - 1 - output_x) * args.stride_x; - new_pool_exponent = -tool::calculate_exponent(args.filter_height * args.filter_width, max_value); - args.mac_shift = args.mac_shift + args.pool_exponent - new_pool_exponent; - args.pool_exponent = new_pool_exponent; -#if CONFIG_ESP32P4_BOOST - args.avg_pool_area_inv = - tool::round(1.f / (args.filter_height * args.filter_width) * (1 << (-args.pool_exponent))); -#else - args.avg_pool_area_inv = (1 << (-args.pool_exponent)) / (args.filter_height * args.filter_width); -#endif - i_impl_func(output_yx, input_syx_real, (void *const)&args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - input_ptr_real += args.input_stride_y_offset; - output_ptr += args.output_y_offset; - } - - for (size_t output_y = 0; output_y < n_h_tail; output_y++) { - input_syx_real = input_ptr_real; - output_yx = output_ptr; - args.filter_height = filter_h - args.padding_h_tail + (n_h_tail - 1 - output_y) * args.stride_y; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - args.padding_w_head + output_x * args.stride_x; - new_pool_exponent = -tool::calculate_exponent(args.filter_height * args.filter_width, max_value); - args.mac_shift = args.mac_shift + args.pool_exponent - new_pool_exponent; - args.pool_exponent = new_pool_exponent; -#if CONFIG_ESP32P4_BOOST - args.avg_pool_area_inv = - tool::round(1.f / (args.filter_height * args.filter_width) * (1 << (-args.pool_exponent))); -#else - args.avg_pool_area_inv = (1 << (-args.pool_exponent)) / (args.filter_height * args.filter_width); -#endif - i_impl_func(output_yx, input_syx_real, (void *const)&args); - output_yx += args.output_x_offset; - } - - input_syx_real += (args.stride_x * n_w_head - args.padding_w_head) * args.input_x_offset; - args.filter_width = filter_w; - new_pool_exponent = -tool::calculate_exponent(args.filter_height * args.filter_width, max_value); - args.mac_shift = args.mac_shift + args.pool_exponent - new_pool_exponent; - args.pool_exponent = new_pool_exponent; -#if CONFIG_ESP32P4_BOOST - args.avg_pool_area_inv = - tool::round(1.f / (args.filter_height * args.filter_width) * (1 << (-args.pool_exponent))); -#else - args.avg_pool_area_inv = (1 << (-args.pool_exponent)) / (args.filter_height * args.filter_width); -#endif - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func(output_yx, input_syx_real, (void *const)&args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = filter_w - args.padding_w_tail + (n_w_tail - 1 - output_x) * args.stride_x; - new_pool_exponent = -tool::calculate_exponent(args.filter_height * args.filter_width, max_value); - args.mac_shift = args.mac_shift + args.pool_exponent - new_pool_exponent; - args.pool_exponent = new_pool_exponent; -#if CONFIG_ESP32P4_BOOST - args.avg_pool_area_inv = - tool::round(1.f / (args.filter_height * args.filter_width) * (1 << (-args.pool_exponent))); -#else - args.avg_pool_area_inv = (1 << (-args.pool_exponent)) / (args.filter_height * args.filter_width); -#endif - i_impl_func(output_yx, input_syx_real, (void *const)&args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - input_ptr_real += args.input_stride_y_offset; - output_ptr += args.output_y_offset; - } - } else // run c_impl_func -#endif - { - buffer_t *buffer = (buffer_t *)tool::calloc_aligned(args.output_channel, sizeof(buffer_t), 16, MALLOC_CAP_8BIT); - feature_t *input_syx_real = input_ptr_real; - feature_t *output_yx = output_ptr; - for (size_t output_y = 0; output_y < n_h_head; output_y++) { - output_yx = output_ptr; - input_syx_real = input_ptr_real; - args.filter_height = filter_h - args.padding_h_head + output_y * args.stride_y; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - args.padding_w_head + output_x * args.stride_x; - c_impl_func(buffer, input_syx_real, output_yx, args); - output_yx += args.output_x_offset; - } - - input_syx_real += (args.stride_x * n_w_head - args.padding_w_head) * args.input_x_offset; - args.filter_width = filter_w; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - c_impl_func(buffer, input_syx_real, output_yx, args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = args.padding_w_head + args.input_width - (n_w_head + n_w_body) * args.stride_x - - output_x * args.stride_x; - c_impl_func(buffer, input_syx_real, output_yx, args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - output_ptr += args.output_y_offset; - } - - input_ptr_real += (args.stride_y * n_h_head - args.padding_h_head) * args.input_y_offset; - args.filter_height = filter_h; - for (size_t output_y = 0; output_y < n_h_body; output_y++) { - input_syx_real = input_ptr_real; - output_yx = output_ptr; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - args.padding_w_head + output_x * args.stride_x; - c_impl_func(buffer, input_syx_real, output_yx, args); - output_yx += args.output_x_offset; - } - - input_syx_real += (args.stride_x * n_w_head - args.padding_w_head) * args.input_x_offset; - args.filter_width = filter_w; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - c_impl_func(buffer, input_syx_real, output_yx, args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = args.padding_w_head + args.input_width - (n_w_head + n_w_body) * args.stride_x - - output_x * args.stride_x; - c_impl_func(buffer, input_syx_real, output_yx, args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - input_ptr_real += args.input_stride_y_offset; - output_ptr += args.output_y_offset; - } - - for (size_t output_y = 0; output_y < n_h_tail; output_y++) { - input_syx_real = input_ptr_real; - output_yx = output_ptr; - args.filter_height = args.padding_h_head + args.input_height - (n_h_head + n_h_body) * args.stride_y - - output_y * args.stride_y; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - args.padding_w_head + output_x * args.stride_x; - c_impl_func(buffer, input_syx_real, output_yx, args); - output_yx += args.output_x_offset; - } - - input_syx_real += (args.stride_x * n_w_head - args.padding_w_head) * args.input_x_offset; - args.filter_width = filter_w; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - c_impl_func(buffer, input_syx_real, output_yx, args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = args.padding_w_head + args.input_width - (n_w_head + n_w_body) * args.stride_x - - output_x * args.stride_x; - c_impl_func(buffer, input_syx_real, output_yx, args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - input_ptr_real += args.input_stride_y_offset; - output_ptr += args.output_y_offset; - } - tool::free_aligned(buffer); - } - return; -} - -/** - * @brief - * - * @tparam feature_t - * @tparam buffer_t - * @param args - * @param i_impl_func - * @param i_impl_func_sp - * @param c_impl_func - * @param n_wise_tail - */ -template -void max_pool_shell(PoolArgsType &args, - void (*i_impl_func)(feature_t *, feature_t *, void *), - void (*i_impl_func_sp)(feature_t *, feature_t *, void *), - void (*c_impl_func)(buffer_t *, feature_t *, feature_t *, PoolArgsType &)) -{ - feature_t *input_ptr_real = (feature_t *)args.input_element; - feature_t *output_ptr = (feature_t *)args.output_element; - int n_h_head = (args.padding_h_head + args.stride_y - 1) / args.stride_y; - int n_w_head = (args.padding_w_head + args.stride_x - 1) / args.stride_x; - int n_h_tail = (args.padding_h_tail + args.stride_y - 1) / args.stride_y; - int n_w_tail = (args.padding_w_tail + args.stride_x - 1) / args.stride_x; - int n_h_body = args.output_height - n_h_tail - n_h_head; - int n_w_body = args.output_width - n_w_tail - n_w_head; - int filter_h = args.filter_height; - int filter_w = args.filter_width; - - if (i_impl_func && i_impl_func_sp) { - feature_t *input_syx_real = input_ptr_real; - feature_t *output_yx = output_ptr; - for (size_t output_y = 0; output_y < n_h_head; output_y++) { - output_yx = output_ptr; - input_syx_real = input_ptr_real; - args.filter_height = filter_h - args.padding_h_head + output_y * args.stride_y; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - args.padding_w_head + output_x * args.stride_x; - i_impl_func(output_yx, input_syx_real, (void *const)&args); - output_yx += args.output_x_offset; - } - - input_syx_real += (args.stride_x * n_w_head - args.padding_w_head) * args.input_x_offset; - args.filter_width = filter_w; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func(output_yx, input_syx_real, (void *const)&args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = filter_w - args.padding_w_tail + (n_w_tail - 1 - output_x) * args.stride_x; - i_impl_func(output_yx, input_syx_real, (void *const)&args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - output_ptr += args.output_y_offset; - } - - input_ptr_real += (args.stride_y * n_h_head - args.padding_h_head) * args.input_y_offset; - args.filter_height = filter_h; - - for (size_t output_y = 0; output_y < n_h_body; output_y++) { - input_syx_real = input_ptr_real; - output_yx = output_ptr; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - args.padding_w_head + output_x * args.stride_x; - i_impl_func(output_yx, input_syx_real, (void *const)&args); - output_yx += args.output_x_offset; - } - - input_syx_real += (args.stride_x * n_w_head - args.padding_w_head) * args.input_x_offset; - args.filter_width = filter_w; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func_sp(output_yx, input_syx_real, (void *const)&args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = filter_w - args.padding_w_tail + (n_w_tail - 1 - output_x) * args.stride_x; - i_impl_func(output_yx, input_syx_real, (void *const)&args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - input_ptr_real += args.input_stride_y_offset; - output_ptr += args.output_y_offset; - } - - for (size_t output_y = 0; output_y < n_h_tail; output_y++) { - input_syx_real = input_ptr_real; - output_yx = output_ptr; - args.filter_height = filter_h - args.padding_h_tail + (n_h_tail - 1 - output_y) * args.stride_y; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - args.padding_w_head + output_x * args.stride_x; - i_impl_func(output_yx, input_syx_real, (void *const)&args); - output_yx += args.output_x_offset; - } - - input_syx_real += (args.stride_x * n_w_head - args.padding_w_head) * args.input_x_offset; - args.filter_width = filter_w; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - i_impl_func(output_yx, input_syx_real, (void *const)&args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = filter_w - args.padding_w_tail + (n_w_tail - 1 - output_x) * args.stride_x; - i_impl_func(output_yx, input_syx_real, (void *const)&args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - input_ptr_real += args.input_stride_y_offset; - output_ptr += args.output_y_offset; - } - } else // run c_impl_func - { - buffer_t *buffer = (buffer_t *)tool::calloc_aligned(args.output_channel, sizeof(buffer_t), 16, MALLOC_CAP_8BIT); - feature_t *input_syx_real = input_ptr_real; - feature_t *output_yx = output_ptr; - for (size_t output_y = 0; output_y < n_h_head; output_y++) { - output_yx = output_ptr; - input_syx_real = input_ptr_real; - args.filter_height = filter_h - args.padding_h_head + output_y * args.stride_y; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - args.padding_w_head + output_x * args.stride_x; - c_impl_func(buffer, input_syx_real, output_yx, args); - output_yx += args.output_x_offset; - } - - input_syx_real += (args.stride_x * n_w_head - args.padding_w_head) * args.input_x_offset; - args.filter_width = filter_w; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - c_impl_func(buffer, input_syx_real, output_yx, args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = filter_w - args.padding_w_tail + (n_w_tail - 1 - output_x) * args.stride_x; - c_impl_func(buffer, input_syx_real, output_yx, args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - output_ptr += args.output_y_offset; - } - - input_ptr_real += (args.stride_y * n_h_head - args.padding_h_head) * args.input_y_offset; - args.filter_height = filter_h; - for (size_t output_y = 0; output_y < n_h_body; output_y++) { - input_syx_real = input_ptr_real; - output_yx = output_ptr; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - args.padding_w_head + output_x * args.stride_x; - c_impl_func(buffer, input_syx_real, output_yx, args); - output_yx += args.output_x_offset; - } - - input_syx_real += (args.stride_x * n_w_head - args.padding_w_head) * args.input_x_offset; - args.filter_width = filter_w; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - c_impl_func(buffer, input_syx_real, output_yx, args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = filter_w - args.padding_w_tail + (n_w_tail - 1 - output_x) * args.stride_x; - c_impl_func(buffer, input_syx_real, output_yx, args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - input_ptr_real += args.input_stride_y_offset; - output_ptr += args.output_y_offset; - } - - for (size_t output_y = 0; output_y < n_h_tail; output_y++) { - input_syx_real = input_ptr_real; - output_yx = output_ptr; - args.filter_height = filter_h - args.padding_h_tail + (n_h_tail - 1 - output_y) * args.stride_y; - - for (size_t output_x = 0; output_x < n_w_head; output_x++) { - args.filter_width = filter_w - args.padding_w_head + output_x * args.stride_x; - c_impl_func(buffer, input_syx_real, output_yx, args); - output_yx += args.output_x_offset; - } - - input_syx_real += (args.stride_x * n_w_head - args.padding_w_head) * args.input_x_offset; - args.filter_width = filter_w; - for (size_t output_x = 0; output_x < n_w_body; output_x++) { - c_impl_func(buffer, input_syx_real, output_yx, args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - for (size_t output_x = 0; output_x < n_w_tail; output_x++) { - args.filter_width = filter_w - args.padding_w_tail + (n_w_tail - 1 - output_x) * args.stride_x; - c_impl_func(buffer, input_syx_real, output_yx, args); - input_syx_real += args.input_stride_x_offset; - output_yx += args.output_x_offset; - } - - input_ptr_real += args.input_stride_y_offset; - output_ptr += args.output_y_offset; - } - tool::free_aligned(buffer); - } - return; -} -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_prelu.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_prelu.cpp deleted file mode 100644 index 328c7e12..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_prelu.cpp +++ /dev/null @@ -1,97 +0,0 @@ -#include "dl_base_prelu.hpp" - -#include "dl_base_isa.hpp" - -namespace dl { -namespace base { -template -void prelu_11c(feature_t *output_ptr, feature_t *input_ptr, const ArgsType &args) -{ - buffer_t temp; - feature_t *alpha_ptr = (feature_t *)args.activation_alpha_ptr; - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - output_ptr[output_c] = input_ptr[output_c]; - if (output_ptr[output_c] < 0) { - temp = - DL_RIGHT_SHIFT((buffer_t)output_ptr[output_c] * (buffer_t)alpha_ptr[output_c], args.activation_shift); - tool::truncate(output_ptr[output_c], temp); - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize prelu -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -inline void load_prelu_11cn_s16(i_impl_acti_s16_t &i_impl_func, - c_impl_acti_s16_t &c_impl_func, - const ArgsType &args) -{ -#if CONFIG_TIE728_BOOST - if (args.input_stride_x_offset % 8 == 0 && args.output_x_offset % 8 == 0 && - !((unsigned)&args.input_element[0] & 15) && !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_tie728_s16_prelu_11c; - } else { - i_impl_func = dl_tie728_s16_unaligned_prelu_11c; - } -#else - c_impl_func = prelu_11c; -#endif // CONFIG_TIE_BOOST -} - -template <> -void prelu(void *const args_ptr) -{ - const ArgsType &args = *((ArgsType *)args_ptr); - - i_impl_acti_s16_t i_impl_func = NULL; - c_impl_acti_s16_t c_impl_func = NULL; - - load_prelu_11cn_s16(i_impl_func, c_impl_func, args); - - activation_shell(args, i_impl_func, c_impl_func); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize prelu -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -inline void load_prelu_11cn_s8(i_impl_acti_s8_t &i_impl_func, - c_impl_acti_s8_t &c_impl_func, - const ArgsType &args) -{ -#if CONFIG_ESP32P4_BOOST - if (args.input_stride_x_offset % 16 == 0 && args.output_x_offset % 16 == 0 && - !((unsigned)&args.input_element[0] & 15) && !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_esp32p4_s8_prelu_11c; - } else { - // i_impl_func = dl_esp32p4_s8_unaligned_prelu_11c; - } -#elif CONFIG_TIE728_BOOST - if (args.input_stride_x_offset % 16 == 0 && args.output_x_offset % 16 == 0 && - !((unsigned)&args.input_element[0] & 15) && !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_tie728_s8_prelu_11c; - } else { - // i_impl_func = dl_tie728_s8_unaligned_prelu_11c; - } -#else - c_impl_func = prelu_11c; -#endif // CONFIG_TIE_BOOST -} - -template <> -void prelu(void *const args_ptr) -{ - const ArgsType &args = *((ArgsType *)args_ptr); - - i_impl_acti_s8_t i_impl_func = NULL; - c_impl_acti_s8_t c_impl_func = NULL; - -#if CONFIG_ESP32P4_BOOST - dl_esp32p4_cfg_round(ROUND_MODE_HALF_EVEN); -#endif - load_prelu_11cn_s8(i_impl_func, c_impl_func, args); - - activation_shell(args, i_impl_func, c_impl_func); -} -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_prelu.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_prelu.hpp deleted file mode 100644 index 14294ed1..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_prelu.hpp +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -#include "dl_base.hpp" - -namespace dl { -namespace base { -/** - * @brief prelu - * - * @tparam feature_t - */ -template -void prelu(void *const args_ptr); -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_relu.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_relu.cpp deleted file mode 100644 index dbc7a36c..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_relu.cpp +++ /dev/null @@ -1,80 +0,0 @@ -#include "dl_base_relu.hpp" - -#include "dl_base_isa.hpp" - -namespace dl { -namespace base { -template -void relu_11c(feature_t *output_ptr, feature_t *input_ptr, const ArgsType &args) -{ - for (size_t output_c = 0; output_c < args.output_channel; output_c++) { - output_ptr[output_c] = input_ptr[output_c] < 0 ? 0 : input_ptr[output_c]; - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize relu -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -inline void load_relu_11cn_s16(i_impl_acti_s16_t &i_impl_func, - c_impl_acti_s16_t &c_impl_func, - const ArgsType &args) -{ -#if CONFIG_TIE728_BOOST - if (args.input_stride_x_offset % 8 == 0 && args.output_x_offset % 8 == 0 && - !((unsigned)&args.input_element[0] & 15) && !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_tie728_s16_relu_11c; - } else { - i_impl_func = dl_tie728_s16_unaligned_relu_11c; - } -#else - c_impl_func = relu_11c; -#endif // CONFIG_TIE_BOOST -} - -template <> -void relu(void *const args_ptr) -{ - const ArgsType &args = *((ArgsType *)args_ptr); - - i_impl_acti_s16_t i_impl_func = NULL; - c_impl_acti_s16_t c_impl_func = NULL; - - load_relu_11cn_s16(i_impl_func, c_impl_func, args); - - activation_shell(args, i_impl_func, c_impl_func); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize relu -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -inline void load_relu_11cn_s8(i_impl_acti_s8_t &i_impl_func, - c_impl_acti_s8_t &c_impl_func, - const ArgsType &args) -{ -#if CONFIG_TIE728_BOOST - if (args.input_stride_x_offset % 16 == 0 && args.output_x_offset % 16 == 0 && - !((unsigned)&args.input_element[0] & 15) && !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_tie728_s8_relu_11c; - } else { - i_impl_func = dl_tie728_s8_unaligned_relu_11c; - } -#else - c_impl_func = relu_11c; -#endif // CONFIG_TIE_BOOST -} - -template <> -void relu(void *const args_ptr) -{ - const ArgsType &args = *((ArgsType *)args_ptr); - - i_impl_acti_s8_t i_impl_func = NULL; - c_impl_acti_s8_t c_impl_func = NULL; - - load_relu_11cn_s8(i_impl_func, c_impl_func, args); - - activation_shell(args, i_impl_func, c_impl_func); -} -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_relu.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_relu.hpp deleted file mode 100644 index 09355dc9..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_relu.hpp +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -#include "dl_base.hpp" - -namespace dl { -namespace base { -/** - * @brief relu - * - * @tparam feature_t - */ -template -void relu(void *const args_ptr); -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_resize2d.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_resize2d.cpp deleted file mode 100644 index a6846bc9..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_resize2d.cpp +++ /dev/null @@ -1,76 +0,0 @@ -#include "dl_base_resize2d.hpp" - -#include "dl_base_isa.hpp" - -namespace dl { -namespace base { -template -inline void resize2d_nearest_2x2_c1(feature_t *output_ptr, feature_t *input_ptr, const resizeArgsType &args) -{ - feature_t *output_ptr_0_0 = output_ptr; - feature_t *output_ptr_0_1 = output_ptr + args.output_x_offset; - feature_t *output_ptr_1_0 = output_ptr + args.output_y_offset; - feature_t *output_ptr_1_1 = output_ptr_1_0 + args.output_x_offset; - - for (int i = 0; i < args.input_channel; i++) { - feature_t output_value = tool::round((float)(*input_ptr++) * args.output_scale / (1 << args.output_shift)); - *(output_ptr_0_0++) = output_value; - *(output_ptr_0_1++) = output_value; - *(output_ptr_1_0++) = output_value; - *(output_ptr_1_1++) = output_value; - } -} - -inline void load_resized2d_nearest_2x2_c1_s8(resize_i_impl_func_s8_t &i_impl_func, - resize_c_impl_func_s8_t &c_impl_func, - const resizeArgsType &args) -{ -#if CONFIG_ESP32P4_BOOST - if (args.input_channel % 16 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_esp32p4_s8_resize2d_nearest_2x2_c1; - } else { - i_impl_func = dl_esp32p4_s8_unaligned_resize2d_nearest_2x2_c1; - } -#elif CONFIG_TIE728_BOOST - if (args.input_channel % 16 == 0 && !((unsigned)&args.input_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - i_impl_func = dl_tie728_s8_resize2d_nearest_2x2_c1; - } else { - i_impl_func = dl_tie728_s8_unaligned_resize2d_nearest_2x2_c1; - } -#else - c_impl_func = resize2d_nearest_2x2_c1; -#endif -} - -template <> -void resize2d(void *args_ptr) -{ - const resizeArgsType &args = *((resizeArgsType *)args_ptr); - if (args.resize_type == RESIZE_NEAREST) { - if (args.scale_y == 2 && args.scale_x == 2) { - resize_i_impl_func_s8_t i_impl_func = NULL; - resize_c_impl_func_s8_t c_impl_func = NULL; - load_resized2d_nearest_2x2_c1_s8(i_impl_func, c_impl_func, args); - resize2d_operation_shell(args, i_impl_func, c_impl_func); - } - } -} - -template <> -void resize2d(void *args_ptr) -{ - // const resizeArgsType &args = *((resizeArgsType *)args_ptr); - // if (args.resize_type == RESIZE_NEAREST){ - // if (args.scale_y == 2 && args.scale_x == 2){ - // resize_i_impl_func_s8_t i_impl_func = NULL; - // resize_c_impl_func_s8_t c_impl_func = NULL; - // load_resized2d_nearest_2x2_c1_s16(i_impl_func, c_impl_func, args); - // resize2d_operation_shell(args, i_impl_func, c_impl_func); - // } - // } -} - -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_resize2d.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_resize2d.hpp deleted file mode 100644 index c1a9ba1f..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_resize2d.hpp +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -#include "dl_base.hpp" - -namespace dl { -namespace base { -/** - * @brief - * - * @tparam feature_t - * @param args_ptr - */ -template -void resize2d(void *args_ptr); -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_shape.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_shape.cpp deleted file mode 100644 index 201fa4a9..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_shape.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#pragma once - -#include "dl_base_shape.hpp" -#include "dl_define.hpp" - -namespace dl { -namespace base { - -/** -In ONNX, a set of tensors are multidirectional broadcastable to the same shape if one of the following is true: - -The tensors all have exactly the same shape. -The tensors all have the same number of dimensions and the length of each dimensions is either a common length or 1. -The tensors that have too few dimensions can have their shapes prepended with a dimension of length 1 to satisfy -property 2. For example, the following tensor shapes are supported by multidirectional broadcasting: - -shape(A) = (2, 3, 4, 5), shape(B) = (,), i.e. B is a scalar ==> shape(result) = (2, 3, 4, 5) -shape(A) = (2, 3, 4, 5), shape(B) = (5,), ==> shape(result) = (2, 3, 4, 5) -shape(A) = (4, 5), shape(B) = (2, 3, 4, 5), ==> shape(result) = (2, 3, 4, 5) -shape(A) = (1, 4, 5), shape(B) = (2, 3, 1, 1), ==> shape(result) = (2, 3, 4, 5) -shape(A) = (3, 4, 5), shape(B) = (2, 1, 1, 1), ==> shape(result) = (2, 3, 4, 5) -*/ -std::vector get_multidirectional_broadcasting_shape(const std::vector &shape1, const std::vector &shape2) -{ - int dim = shape1.size(); - if (dim < shape2.size()) { - dim = shape2.size(); - } - - std::vector output_shape(dim); - for (int i = dim - 1; i >= 0; i--) { - int index1 = -1; - int index2 = -1; - int dim1 = 0; - int dim2 = 0; - - index1 = shape1.size() - (dim - i); - index2 = shape2.size() - (dim - i); - if (index1 >= 0) - dim1 = shape1[index1]; - if (index2 >= 0) - dim2 = shape2[index2]; - output_shape[i] = DL_MAX(dim1, dim2); - } - - return output_shape; -} - -/** -In ONNX, tensor B is unidirectional broadcastable to tensor A if one of the following is true: - -Tensor A and B both have exactly the same shape. -Tensor A and B all have the same number of dimensions and the length of each dimensions is either a common length or B's -length is 1. Tensor B has too few dimensions, and B can have its shapes prepended with a dimension of length 1 to -satisfy property 2. When unidirectional broadcasting happens, the output's shape is the same as the shape of A (i.e., -the larger shape of two input tensors). - -In the following examples, tensor B is unidirectional broadcastable to tensor A: - -shape(A) = (2, 3, 4, 5), shape(B) = (,), i.e. B is a scalar ==> shape(result) = (2, 3, 4, 5) -shape(A) = (2, 3, 4, 5), shape(B) = (5,), ==> shape(result) = (2, 3, 4, 5) -shape(A) = (2, 3, 4, 5), shape(B) = (2, 1, 1, 5), ==> shape(result) = (2, 3, 4, 5) -shape(A) = (2, 3, 4, 5), shape(B) = (1, 3, 1, 5), ==> shape(result) = (2, 3, 4, 5) -*/ -std::vector get_unidirectional_broadcasting_shape(const std::vector &shape1, const std::vector &shape2) -{ - return std::vector(shape1); -} - -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_shape.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_shape.hpp deleted file mode 100644 index 1baa315c..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_shape.hpp +++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -#include "dl_base.hpp" - -namespace dl { -namespace base { -/** - * @brief multidirectional broadcasting - * refer to https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md - * - * @param shape1 Shape of input1 - * @param shape2 Shape of input2 - * - * @return Shape of output - */ -std::vector get_multidirectional_broadcasting_shape(const std::vector &shape1, - const std::vector &shape2); - -/** - * @brief unidirectional broadcasting - * refer to https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md - * - * @param shape1 Shape of input1 - * @param shape2 Shape of input2 - * - * @return Shape of output - */ -std::vector get_unidirectional_broadcasting_shape(const std::vector &shape1, const std::vector &shape2); - -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_sub2d.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_sub2d.cpp deleted file mode 100644 index 84932a80..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_sub2d.cpp +++ /dev/null @@ -1,218 +0,0 @@ -#include "dl_base_sub2d.hpp" - -#include "dl_base_activate_output.hpp" -#include "dl_base_isa.hpp" - -namespace dl { -namespace base { -template -inline void sub2d_11c(feature_t *output_ptr, - feature_t *input0_ptr, - feature_t *input1_ptr, - const arithArgsType &args) -{ - buffer_t buffer; - for (size_t output_c = 0; output_c < args.channel; output_c++) // C - { - buffer = (buffer_t)input0_ptr[output_c] - (buffer_t)input1_ptr[output_c]; - tool::truncate(output_ptr[output_c], buffer); - } -} - -template -inline void sub2d_11c_rescale(feature_t *output_ptr, - feature_t *input0_ptr, - feature_t *input1_ptr, - const arithArgsType &args) -{ - buffer_t buffer; - if (args.rescale_input < 2) { - for (size_t output_c = 0; output_c < args.channel; output_c++) // C - { - buffer = - (buffer_t)input0_ptr[output_c] - (buffer_t)(DL_RIGHT_SHIFT(input1_ptr[output_c], args.input_shift)); - buffer = DL_RIGHT_SHIFT(buffer * args.output_scale, args.output_shift); - tool::truncate(output_ptr[output_c], buffer); - } - } else { - for (size_t output_c = 0; output_c < args.channel; output_c++) // C - { - buffer = - (buffer_t)(DL_RIGHT_SHIFT(input1_ptr[output_c], args.input_shift)) - (buffer_t)input0_ptr[output_c]; - buffer = DL_RIGHT_SHIFT(buffer * args.output_scale, args.output_shift); - tool::truncate(output_ptr[output_c], buffer); - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize sub2d -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -inline void load_sub2d_11c_s16(arith_i_impl_func_s16_t &i_impl_func, - arith_c_impl_func_s16_t &c_impl_func, - arith_n_wise_tail_s16_t &n_wise_tail, - const arithArgsType &args) -{ -#if CONFIG_TIE728_BOOST - if (args.input0_x_offset % 8 == 0 && args.input1_x_offset % 8 == 0 && args.output_x_offset % 8 == 0 && - !((unsigned)&args.input0_element[0] & 15) && !((unsigned)&args.input1_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - switch (args.activation_type) { - case Linear: - - if (args.input_shift == -1) { - i_impl_func = dl_tie728_s16_sub2d_11c; - } else - i_impl_func = dl_tie728_s16_rescale_sub2d_11c; - break; - case ReLU: - case LeakyReLU: - if (args.input_shift == -1) { - i_impl_func = dl_tie728_s16_sub2d_11c_relu; - } else - i_impl_func = dl_tie728_s16_rescale_sub2d_11c_relu; - break; - case PReLU: - if (args.input_shift == -1) { - i_impl_func = dl_tie728_s16_sub2d_11c_prelu; - } else - i_impl_func = dl_tie728_s16_rescale_sub2d_11c_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func = dl_tie728_s16_unaligned_sub2d_11c; - break; - case ReLU: - case LeakyReLU: - i_impl_func = dl_tie728_s16_unaligned_sub2d_11c_relu; - break; - case PReLU: - i_impl_func = dl_tie728_s16_unaligned_sub2d_11c_prelu; - break; - } - } -#else - if (args.input_shift == -1) - c_impl_func = sub2d_11c; - else - c_impl_func = sub2d_11c_rescale; - - switch (args.activation_type) { - case Linear: - n_wise_tail = NULL; - break; - case ReLU: - n_wise_tail = arith_output_relu; - break; - case LeakyReLU: - n_wise_tail = arith_output_leakyrelu; - break; - case PReLU: - n_wise_tail = arith_output_prelu; - break; - } -#endif // CONFIG_TIE728_BOOST -} - -template <> -void sub2d(void *const args_ptr) -{ - const arithArgsType &args = *((arithArgsType *)args_ptr); - - arith_i_impl_func_s16_t i_impl_func = NULL; - arith_c_impl_func_s16_t c_impl_func = NULL; - arith_n_wise_tail_s16_t n_wise_tail = NULL; - - load_sub2d_11c_s16(i_impl_func, c_impl_func, n_wise_tail, args); - - arith_operation_shell(args, i_impl_func, c_impl_func, n_wise_tail); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// specialize sub2d -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -inline void load_sub2d_11c_s8(arith_i_impl_func_s8_t &i_impl_func, - arith_c_impl_func_s8_t &c_impl_func, - arith_n_wise_tail_s8_t &n_wise_tail, - const arithArgsType &args) -{ -#if CONFIG_TIE728_BOOST - if (args.input0_x_offset % 16 == 0 && args.input1_x_offset % 16 == 0 && args.output_x_offset % 16 == 0 && - !((unsigned)&args.input0_element[0] & 15) && !((unsigned)&args.input1_element[0] & 15) && - !((unsigned)&args.output_element[0] & 15)) { - switch (args.activation_type) { - case Linear: - - if (args.input_shift == -1) { - i_impl_func = dl_tie728_s8_sub2d_11c; - } else - i_impl_func = dl_tie728_s8_rescale_sub2d_11c; - break; - case ReLU: - case LeakyReLU: - if (args.input_shift == -1) { - i_impl_func = dl_tie728_s8_sub2d_11c_relu; - } else - i_impl_func = dl_tie728_s8_rescale_sub2d_11c_relu; - break; - case PReLU: - if (args.input_shift == -1) { - i_impl_func = dl_tie728_s8_sub2d_11c_prelu; - } else - i_impl_func = dl_tie728_s8_rescale_sub2d_11c_prelu; - break; - } - } else { - switch (args.activation_type) { - case Linear: - i_impl_func = dl_tie728_s8_unaligned_sub2d_11c; - break; - case ReLU: - case LeakyReLU: - i_impl_func = dl_tie728_s8_unaligned_sub2d_11c_relu; - break; - case PReLU: - i_impl_func = dl_tie728_s8_unaligned_sub2d_11c_prelu; - break; - } - } -#else - if (args.input_shift == -1) - c_impl_func = sub2d_11c; - else - c_impl_func = sub2d_11c_rescale; - - switch (args.activation_type) { - case Linear: - n_wise_tail = NULL; - break; - case ReLU: - n_wise_tail = arith_output_relu; - break; - case LeakyReLU: - n_wise_tail = arith_output_leakyrelu; - break; - case PReLU: - n_wise_tail = arith_output_prelu; - break; - } -#endif // CONFIG_TIE728_BOOST -} - -template <> -void sub2d(void *const args_ptr) -{ - const arithArgsType &args = *((arithArgsType *)args_ptr); - - arith_i_impl_func_s8_t i_impl_func = NULL; - arith_c_impl_func_s8_t c_impl_func = NULL; - arith_n_wise_tail_s8_t n_wise_tail = NULL; - - load_sub2d_11c_s8(i_impl_func, c_impl_func, n_wise_tail, args); - - arith_operation_shell(args, i_impl_func, c_impl_func, n_wise_tail); -} -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_sub2d.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_sub2d.hpp deleted file mode 100644 index c680fbdd..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/dl_base_sub2d.hpp +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#include "dl_base.hpp" - -namespace dl { -namespace base { -/** - * @brief sub2d - * - * @tparam feature_t - * @param args_ptr - */ -template -void sub2d(void *const args_ptr); - -} // namespace base -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/dl_base_isa.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/dl_base_isa.hpp deleted file mode 100644 index 97855f69..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/dl_base_isa.hpp +++ /dev/null @@ -1,412 +0,0 @@ -#pragma once - -extern "C" { -#if CONFIG_XTENSA_BOOST -void dl_xtensa_s16_conv2d_11cn_bias(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_xtensa_s16_conv2d_11cn_bias_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_xtensa_s16_conv2d_11cn(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_xtensa_s16_conv2d_11cn_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_xtensa_s16_conv2d_33cn_bias(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_xtensa_s16_conv2d_33cn_bias_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_xtensa_s16_conv2d_33cn(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_xtensa_s16_conv2d_33cn_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_xtensa_s16_conv2d_hwcn_bias(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_xtensa_s16_conv2d_hwcn_bias_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_xtensa_s16_conv2d_hwcn(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_xtensa_s16_conv2d_hwcn_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -#endif - -#if CONFIG_TIE728_BOOST -void dl_tie728_s16_conv2d_11cn_bias(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_conv2d_11cn_bias_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_conv2d_11cn_bias_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_conv2d_11cn(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_conv2d_11cn_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_conv2d_11cn_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_tie728_s16_unaligned_conv2d_11cn(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_11cn_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_11cn_leakyrelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_11cn_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_11cn_bias(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_11cn_bias_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_11cn_bias_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_tie728_s16_conv2d_33cn_bias(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_conv2d_33cn_bias_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_conv2d_33cn_bias_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_conv2d_33cn(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_conv2d_33cn_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_conv2d_33cn_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_tie728_s16_unaligned_conv2d_33cn(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_33cn_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_33cn_leakyrelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_33cn_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_33cn_bias(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_33cn_bias_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_33cn_bias_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_tie728_s16_conv2d_hwcn_bias(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_conv2d_hwcn_bias_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_conv2d_hwcn_bias_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_conv2d_hwcn(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_conv2d_hwcn_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_conv2d_hwcn_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_tie728_s16_unaligned_conv2d_hwcn(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_hwcn_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_hwcn_leakyrelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_hwcn_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_hwcn_bias(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_hwcn_bias_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_conv2d_hwcn_bias_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_tie728_s16_depthwise_conv2d_33c1_bias(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_depthwise_conv2d_33c1_bias_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_depthwise_conv2d_33c1_bias_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_depthwise_conv2d_33c1(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_depthwise_conv2d_33c1_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_depthwise_conv2d_33c1_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_tie728_s16_unaligned_depthwise_conv2d_33c1(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_depthwise_conv2d_33c1_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_depthwise_conv2d_33c1_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_depthwise_conv2d_33c1_bias(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_tie728_s16_depthwise_conv2d_hwc1_bias(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_depthwise_conv2d_hwc1_bias_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_depthwise_conv2d_hwc1_bias_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_depthwise_conv2d_hwc1(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_depthwise_conv2d_hwc1_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_depthwise_conv2d_hwc1_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_tie728_s16_unaligned_depthwise_conv2d_hwc1(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_tie728_s16_max_pool2d_hwc1(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_max_pool2d_22c1(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_max_pool2d_hwc1(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_max_pool2d_22c1(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_tie728_s16_avg_pool2d_hwc1(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_avg_pool2d_22c1(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_avg_pool2d_hwc1(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_avg_pool2d_22c1(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_tie728_s16_add2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_tie728_s16_add2d_11c_relu(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_tie728_s16_add2d_11c_prelu(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_tie728_s16_rescale_add2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_tie728_s16_rescale_add2d_11c_relu(int16_t *output_ptr, - int16_t *input0_ptr, - int16_t *input1_ptr, - void *args_ptr); -void dl_tie728_s16_rescale_add2d_11c_prelu(int16_t *output_ptr, - int16_t *input0_ptr, - int16_t *input1_ptr, - void *args_ptr); -void dl_tie728_s16_unaligned_add2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_add2d_11c_relu(int16_t *output_ptr, - int16_t *input0_ptr, - int16_t *input1_ptr, - void *args_ptr); -void dl_tie728_s16_unaligned_add2d_11c_prelu(int16_t *output_ptr, - int16_t *input0_ptr, - int16_t *input1_ptr, - void *args_ptr); - -void dl_tie728_s16_sub2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_tie728_s16_sub2d_11c_relu(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_tie728_s16_sub2d_11c_prelu(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_tie728_s16_rescale_sub2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_tie728_s16_rescale_sub2d_11c_relu(int16_t *output_ptr, - int16_t *input0_ptr, - int16_t *input1_ptr, - void *args_ptr); -void dl_tie728_s16_rescale_sub2d_11c_prelu(int16_t *output_ptr, - int16_t *input0_ptr, - int16_t *input1_ptr, - void *args_ptr); -void dl_tie728_s16_unaligned_sub2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_sub2d_11c_relu(int16_t *output_ptr, - int16_t *input0_ptr, - int16_t *input1_ptr, - void *args_ptr); -void dl_tie728_s16_unaligned_sub2d_11c_prelu(int16_t *output_ptr, - int16_t *input0_ptr, - int16_t *input1_ptr, - void *args_ptr); - -void dl_tie728_s16_mul2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_tie728_s16_mul2d_11c_relu(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_tie728_s16_mul2d_11c_prelu(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_mul2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_mul2d_11c_relu(int16_t *output_ptr, - int16_t *input0_ptr, - int16_t *input1_ptr, - void *args_ptr); -void dl_tie728_s16_unaligned_mul2d_11c_prelu(int16_t *output_ptr, - int16_t *input0_ptr, - int16_t *input1_ptr, - void *args_ptr); - -void dl_tie728_s16_max2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_max2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); - -void dl_tie728_s16_min2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_min2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); - -void dl_tie728_s16_relu_11c(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_relu_11c(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_tie728_s16_prelu_11c(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_tie728_s16_unaligned_prelu_11c(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -/* Int8 API */ -void dl_tie728_s8_conv2d_11cn(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_conv2d_11cn_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_conv2d_11cn_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_conv2d_11cn(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_tie728_s8_conv2d_33cn(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_conv2d_33cn_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_conv2d_33cn_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_conv2d_33cn(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_tie728_s8_conv2d_hwcn(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_conv2d_hwcn_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_conv2d_hwcn_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_conv2d_hwcn(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_tie728_s8_depthwise_conv2d_33c1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_depthwise_conv2d_33c1_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_depthwise_conv2d_33c1_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_depthwise_conv2d_33c1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_tie728_s8_depthwise_conv2d_hwc1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_depthwise_conv2d_hwc1_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_depthwise_conv2d_hwc1_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_depthwise_conv2d_hwc1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_tie728_s8_max_pool2d_22c1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_max_pool2d_22c1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_tie728_s8_max_pool2d_hwc1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_max_pool2d_hwc1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_tie728_s8_avg_pool2d_22c1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_avg_pool2d_22c1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_tie728_s8_avg_pool2d_hwc1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_avg_pool2d_hwc1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_tie728_s8_add2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_add2d_11c_relu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_add2d_11c_prelu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_rescale_add2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_rescale_add2d_11c_relu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_rescale_add2d_11c_prelu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_add2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_add2d_11c_relu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_add2d_11c_prelu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); - -void dl_tie728_s8_sub2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_sub2d_11c_relu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_sub2d_11c_prelu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_rescale_sub2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_rescale_sub2d_11c_relu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_rescale_sub2d_11c_prelu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_sub2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_sub2d_11c_relu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_sub2d_11c_prelu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); - -void dl_tie728_s8_mul2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_mul2d_11c_relu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_mul2d_11c_prelu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_mul2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_mul2d_11c_relu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_mul2d_11c_prelu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); - -void dl_tie728_s8_max2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_max2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); - -void dl_tie728_s8_min2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_min2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); - -void dl_tie728_s8_relu_11c(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_relu_11c(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_tie728_s8_prelu_11c(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -// void dl_tie728_s8_unaligned_prelu_11c(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_tie728_s8_resize2d_nearest_2x2_c1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_tie728_s8_unaligned_resize2d_nearest_2x2_c1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -#endif - -#if CONFIG_IDF_TARGET_ESP32P4 -/* Int16 API */ -void dl_esp32p4_s16_conv2d_11cn_bias(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_esp32p4_s16_conv2d_11cn_bias_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_esp32p4_s16_conv2d_11cn(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_esp32p4_s16_conv2d_11cn_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_esp32p4_s16_conv2d_33cn_bias(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_esp32p4_s16_conv2d_33cn_bias_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_esp32p4_s16_conv2d_33cn(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_esp32p4_s16_conv2d_33cn_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_esp32p4_s16_conv2d_hwcn_bias(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_esp32p4_s16_conv2d_hwcn_bias_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_esp32p4_s16_conv2d_hwcn(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); -void dl_esp32p4_s16_conv2d_hwcn_relu(int16_t *output_ptr, int16_t *input_ptr, void *args_ptr); - -void dl_esp32p4_s16_add2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s16_add2d_11c_relu(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s16_add2d_11c_prelu(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s16_rescale_add2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s16_rescale_add2d_11c_relu(int16_t *output_ptr, - int16_t *input0_ptr, - int16_t *input1_ptr, - void *args_ptr); -void dl_esp32p4_s16_rescale_add2d_11c_prelu(int16_t *output_ptr, - int16_t *input0_ptr, - int16_t *input1_ptr, - void *args_ptr); -void dl_esp32p4_s16_unaligned_add2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s16_unaligned_add2d_11c_relu(int16_t *output_ptr, - int16_t *input0_ptr, - int16_t *input1_ptr, - void *args_ptr); -void dl_esp32p4_s16_unaligned_add2d_11c_prelu(int16_t *output_ptr, - int16_t *input0_ptr, - int16_t *input1_ptr, - void *args_ptr); - -void dl_esp32p4_s16_mul2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s16_mul2d_11c_relu(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s16_mul2d_11c_prelu(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s16_unaligned_mul2d_11c(int16_t *output_ptr, int16_t *input0_ptr, int16_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s16_unaligned_mul2d_11c_relu(int16_t *output_ptr, - int16_t *input0_ptr, - int16_t *input1_ptr, - void *args_ptr); -void dl_esp32p4_s16_unaligned_mul2d_11c_prelu(int16_t *output_ptr, - int16_t *input0_ptr, - int16_t *input1_ptr, - void *args_ptr); - -/* Int8 API */ -void dl_esp32p4_s8_conv2d_11cn_bias(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_conv2d_11cn_bias_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_conv2d_11cn_bias_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_conv2d_11cn(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_conv2d_11cn_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_conv2d_11cn_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_esp32p4_s8_unaligned_conv2d_11cn_bias(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_conv2d_11cn_bias_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_conv2d_11cn(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_conv2d_11cn_leakyrelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_conv2d_11cn_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_esp32p4_s8_conv2d_33cn_bias(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_conv2d_33cn_bias_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_conv2d_33cn_bias_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_conv2d_33cn(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_conv2d_33cn_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_conv2d_33cn_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_esp32p4_s8_unaligned_conv2d_33cn_bias(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_conv2d_33cn_bias_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_conv2d_33cn(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_conv2d_33cn_leakyrelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_conv2d_33cn_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_esp32p4_s8_conv2d_hwcn_bias(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_conv2d_hwcn_bias_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_conv2d_hwcn_bias_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_conv2d_hwcn(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_conv2d_hwcn_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_conv2d_hwcn_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_esp32p4_s8_unaligned_conv2d_hwcn_bias(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_conv2d_hwcn(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_conv2d_hwcn_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_esp32p4_s8_depthwise_conv2d_33c1_bias(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_depthwise_conv2d_33c1_bias_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_depthwise_conv2d_33c1_bias_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_depthwise_conv2d_33c1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_depthwise_conv2d_33c1_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_depthwise_conv2d_33c1_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_esp32p4_s8_depthwise_conv2d_hwc1_bias(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_depthwise_conv2d_hwc1_bias_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_depthwise_conv2d_hwc1_bias_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_depthwise_conv2d_hwc1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_depthwise_conv2d_hwc1_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_depthwise_conv2d_hwc1_prelu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_esp32p4_s8_mul2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s8_mul2d_11c_relu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s8_mul2d_11c_prelu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_mul2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_mul2d_11c_relu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_mul2d_11c_prelu(int8_t *output_ptr, - int8_t *input0_ptr, - int8_t *input1_ptr, - void *args_ptr); - -void dl_esp32p4_s8_add2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s8_add2d_11c_relu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s8_add2d_11c_prelu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s8_rescale_add2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s8_rescale_add2d_11c_relu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s8_rescale_add2d_11c_prelu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_add2d_11c(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_add2d_11c_relu(int8_t *output_ptr, int8_t *input0_ptr, int8_t *input1_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_add2d_11c_prelu(int8_t *output_ptr, - int8_t *input0_ptr, - int8_t *input1_ptr, - void *args_ptr); - -void dl_esp32p4_s8_avg_pool2d_22c1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_avg_pool2d_22c1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_avg_pool2d_hwc1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_avg_pool2d_hwc1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_esp32p4_s8_resize2d_nearest_2x2_c1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -void dl_esp32p4_s8_unaligned_resize2d_nearest_2x2_c1(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -void dl_esp32p4_s8_prelu_11c(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); -// void dl_esp32p4_s8_unaligned_prelu_11c(int8_t *output_ptr, int8_t *input_ptr, void *args_ptr); - -#endif -} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_common.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_common.S deleted file mode 100644 index d6b10817..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_common.S +++ /dev/null @@ -1,166 +0,0 @@ -############################################################################################################################################################ -# esp32p4_common series -############################################################################################################################################################ -.macro esp32p4_push_4_stacks_1r value_0 - # The order of register arguments for push and pop must be consistent. - addi sp, sp, -4 - sw \value_0, 0(sp) -.endm - - - -.macro esp32p4_pop_4_stacks_1r value_0 - # The order of register arguments for push and pop must be consistent. - lw \value_0, 0(sp) - addi sp, sp, 4 -.endm - - - -.macro esp32p4_push_12_stacks_3r value_0, value_1, value_2 - # The order of register arguments for push and pop must be consistent. - addi sp, sp, -12 - sw \value_0, 8(sp) - sw \value_1, 4(sp) - sw \value_2, 0(sp) -.endm - - - -.macro esp32p4_pop_12_stacks_3r value_0, value_1, value_2 - # The order of register arguments for push and pop must be consistent. - lw \value_0, 8(sp) - lw \value_1, 4(sp) - lw \value_2, 0(sp) - addi sp, sp, 12 -.endm - - - -.macro esp32p4_push_20_stacks_5r value_0, value_1, value_2, value_3, value_4 - # The order of register arguments for push and pop must be consistent. - addi sp, sp, -20 - sw \value_0, 16(sp) - sw \value_1, 12(sp) - sw \value_2, 8(sp) - sw \value_3, 4(sp) - sw \value_4, 0(sp) -.endm - - - -.macro esp32p4_pop_20_stacks_5r value_0, value_1, value_2, value_3, value_4 - # The order of register arguments for push and pop must be consistent. - lw \value_0, 16(sp) - lw \value_1, 12(sp) - lw \value_2, 8(sp) - lw \value_3, 4(sp) - lw \value_4, 0(sp) - addi sp, sp, 20 -.endm - - - -.macro esp32p4_push_28_stacks_7r value_0, value_1, value_2, value_3, value_4, value_5, value_6 - # The order of register arguments for push and pop must be consistent. - addi sp, sp, -28 - sw \value_0, 24(sp) - sw \value_1, 20(sp) - sw \value_2, 16(sp) - sw \value_3, 12(sp) - sw \value_4, 8(sp) - sw \value_5, 4(sp) - sw \value_6, 0(sp) -.endm - - - -.macro esp32p4_pop_28_stacks_7r value_0, value_1, value_2, value_3, value_4, value_5, value_6 - # The order of register arguments for push and pop must be consistent. - lw \value_0, 24(sp) - lw \value_1, 20(sp) - lw \value_2, 16(sp) - lw \value_3, 12(sp) - lw \value_4, 8(sp) - lw \value_5, 4(sp) - lw \value_6, 0(sp) - addi sp, sp, 28 -.endm - - - -.macro esp32p4_push_36_stacks_9r value_0, value_1, value_2, value_3, value_4, value_5, value_6, value_7, value_8 - # The order of register arguments for push and pop must be consistent. - addi sp, sp, -36 - sw \value_0, 32(sp) - sw \value_1, 28(sp) - sw \value_2, 24(sp) - sw \value_3, 20(sp) - sw \value_4, 16(sp) - sw \value_5, 12(sp) - sw \value_6, 8(sp) - sw \value_7, 4(sp) - sw \value_8, 0(sp) -.endm - - - -.macro esp32p4_pop_36_stacks_9r value_0, value_1, value_2, value_3, value_4, value_5, value_6, value_7, value_8 - # The order of register arguments for push and pop must be consistent. - lw \value_0, 32(sp) - lw \value_1, 28(sp) - lw \value_2, 24(sp) - lw \value_3, 20(sp) - lw \value_4, 16(sp) - lw \value_5, 12(sp) - lw \value_6, 8(sp) - lw \value_7, 4(sp) - lw \value_8, 0(sp) - addi sp, sp, 36 -.endm - - - -.macro esp32p4_push_128_stacks_4r value_0, value_1, value_2, value_3 - # The order of register arguments for push and pop must be consistent. - addi sp, sp, -128 - sw \value_0, 124(sp) - sw \value_1, 120(sp) - sw \value_2, 116(sp) - sw \value_3, 112(sp) -.endm - - - -.macro esp32p4_pop_128_stacks_4r value_0, value_1, value_2, value_3 - # The order of register arguments for push and pop must be consistent. - lw \value_0, 124(sp) - lw \value_1, 120(sp) - lw \value_2, 116(sp) - lw \value_3, 112(sp) - addi sp, sp, 128 -.endm - - - -.macro esp32p4_clamp input, min, max - // check input and min - blt \input, \min, 0f // if input < min - - // check input and max - blt \max, \input, 1f // if max < input - - // If the input value is already within the range, there is no need for clamping; proceed directly to the end. - j 2f - - 0: - // If the input value is less than the minimum value, assign the minimum value to the result register. - mv \input, \min - j 2f - - 1: - // If the input value exceeds the maximum value, assign the maximum value to the result register. - mv \input, \max - - 2: -.endm \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_prelu.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_prelu.S deleted file mode 100644 index 4bfcaaad..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_prelu.S +++ /dev/null @@ -1,75 +0,0 @@ -#include "dl_esp32p4_s8.S" -#include "dl_esp32p4_common.S" - - .align 2 - .text - .global dl_esp32p4_s8_prelu_11c - .type dl_esp32p4_s8_prelu_11c, @function -dl_esp32p4_s8_prelu_11c: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - # a3: c_div_x_1 - # s8: activation_alpha_ptr - # s9: activation_shift - # t3: output_shift - # t4: output_scale - - - lw a3, 100(a2) - lw s8, 80(a2) # activation_alpha_ptr - lw s9, 84(a2) # activation_shift - lw t3, 172(a2) # output_shift - lw t4, 176(a2) # output_scale - - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, s8, 16 - sb x0, 0(sp) - add s0, sp, x0 - esp.vldbc.8.ip q2, s0, 0 # all 0 - sb t4, 0(sp) - add s0, sp, x0 - esp.vldbc.8.ip q3, s0, 0 # all output_scale - - add t0, a3, x0 - blez t0, 1f - 0: - esp.vcmp.gt.s8 q4, q0, q2 - esp.notq q5, q4 - - esp.vprelu.s8 q1, q0, q1, s9 - - # *scale/right shift: output - input - esp.zero.qacc - esp.vmulas.s8.qacc q0, q3 - esp.srcmb.s8.qacc q0, t3, 1 - esp.andq q0, q0, q4 - esp.andq q1, q1, q5 - esp.vadd.s8.ld.incp q1, s8, q0, q0, q1 - - esp.vst.128.ip q0, a0, 16 - esp.vld.128.ip q0, a1, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - esp.vcmp.gt.s8 q4, q0, q2 - esp.notq q5, q4 - - esp.vprelu.s8 q1, q0, q1, s9 - - # *scale/right shift: output - input - esp.zero.qacc - esp.vmulas.s8.qacc q0, q3 - esp.srcmb.s8.qacc q0, t3, 1 - esp.andq q0, q0, q4 - esp.andq q1, q1, q5 - esp.vadd.s8 q0, q0, q1 - - esp.vst.128.ip q0, a0, 0 - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s16.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s16.S deleted file mode 100644 index 40241cda..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s16.S +++ /dev/null @@ -1,177 +0,0 @@ -#pragma once - - -############################################################################################################################################################ -# result process for conv2d / depthwise_conv2d -############################################################################################################################################################ -.macro esp32p4_s16_conv2d_128b_vector_bias bias_ptr - esp.ld.qacc.l.l.128.ip \bias_ptr, 16 - esp.ld.qacc.l.h.128.ip \bias_ptr, 16 - esp.ld.qacc.h.l.128.ip \bias_ptr, 16 - esp.ld.qacc.h.h.128.ip \bias_ptr, 16 -.endm - - - -############################################################################################################################################################ -# esp32p4_s16_128b_vector series -############################################################################################################################################################ -.macro esp32p4_s16_128b_vector_shift_result output_v, mac_shift - esp.srcmb.s16.qacc \output_v, \mac_shift, 1 -.endm - - - -.macro esp32p4_s16_128b_aligned_vector_store output_v, output_ptr - esp.vst.128.ip \output_v, \output_ptr, 16 -.endm - - - -.macro esp32p4_s16_128b_vector_relu output_v, activation_alpha, activation_shift - # LeakyReLU - esp.vrelu.s16 \output_v, \activation_alpha, \activation_shift -.endm - -.macro dl_esp32p4_128b_unaligned_store0 output_v, output_ptr, tmp32 - esp.movi.32.a \output_v, \tmp32, 0 - sw \tmp32, 0(\output_ptr) - esp.movi.32.a \output_v, \tmp32, 1 - sw \tmp32, 4(\output_ptr) - esp.movi.32.a \output_v, \tmp32, 2 - sw \tmp32, 8(\output_ptr) - esp.movi.32.a \output_v, \tmp32, 3 - sw \tmp32, 12(\output_ptr) - addi \output_ptr, \output_ptr, 16 -.endm - -.macro dl_esp32p4_128b_unaligned_store1 output_v, output_ptr - esp.vst.l.64.ip \output_v, \output_ptr, 8 - esp.vst.h.64.ip \output_v, \output_ptr, 8 -.endm - -.macro dl_esp32p4_128b_last_store_data tmp_q, output_v, tmp_a, c_remainder_bytes - beqz \c_remainder_bytes, 600f - li \tmp_a, 15 - sub \tmp_a, \tmp_a, \c_remainder_bytes - li \c_remainder_bytes, 0 - esp.slcxxp.2q \tmp_q, \output_v, \tmp_a, \c_remainder_bytes #left shift to make the rest part 0 - esp.srcxxp.2q \output_v, \tmp_q, \tmp_a, \c_remainder_bytes #right shift to lower bits -600: -.endm - - -.macro dl_esp32p4_s16_store_aligned_remainder remainder_data, c_remainder, tmp_a, output_ptr -607: # remainder == 1, 0x111 - andi t0, \c_remainder, 4 - beqz t0, 603f - andi t0, \c_remainder, 2 - beqz t0, 605f - andi t0, \c_remainder, 1 - beqz t0, 606f - esp.vst.l.64.ip \remainder_data, \output_ptr, 0 - esp.movi.32.a \remainder_data, \tmp_a, 2 - sw \tmp_a, 8(\output_ptr) - esp.movi.32.a \remainder_data, \tmp_a, 3 - sh \tmp_a, 12(\output_ptr) - j 600f -606: # remainder == 1, 0x110 - esp.vst.l.64.ip \remainder_data, \output_ptr, 0 - esp.movi.32.a \remainder_data, \tmp_a, 2 - sw \tmp_a, 8(\output_ptr) - j 600f -605: # remainder == 1, 0x101 - andi t0, \c_remainder, 1 - beqz t0, 604f - esp.vst.l.64.ip \remainder_data, \output_ptr, 0 - esp.movi.32.a \remainder_data, \tmp_a, 2 - sh \tmp_a, 8(\output_ptr) - j 600f -604: # remainder == 1, 0x100 - esp.vst.l.64.ip \remainder_data, \output_ptr, 0 - j 600f -603: # remainder == 1, 0x011 - andi t0, \c_remainder, 2 - beqz t0, 601f - andi t0, \c_remainder, 1 - beqz t0, 602f - esp.movi.32.a \remainder_data, \tmp_a, 0 - sw \tmp_a, 0(\output_ptr) - esp.movi.32.a \remainder_data, \tmp_a, 1 - sh \tmp_a, 4(\output_ptr) - j 600f -602: # remainder == 1, 0x010 - esp.movi.32.a \remainder_data, \tmp_a, 0 - sw \tmp_a, 0(\output_ptr) - j 600f -601: # remainder == 1, 0x001 - andi t0, \c_remainder, 1 - beqz t0, 600f - esp.movi.32.a \remainder_data, \tmp_a, 0 - sh \tmp_a, 0(\output_ptr) -600: -.endm - - -.macro dl_esp32p4_s16_store_remainder remainder_data, c_remainder, tmp_a, output_ptr -607: # remainder == 1, 0x111 - andi t0, \c_remainder, 4 - beqz t0, 603f - andi t0, \c_remainder, 2 - beqz t0, 605f - andi t0, \c_remainder, 1 - beqz t0, 606f - esp.movi.32.a \remainder_data, \tmp_a, 0 - sw \tmp_a, 0(\output_ptr) - esp.movi.32.a \remainder_data, \tmp_a, 1 - sw \tmp_a, 4(\output_ptr) - esp.movi.32.a \remainder_data, \tmp_a, 2 - sw \tmp_a, 8(\output_ptr) - esp.movi.32.a \remainder_data, \tmp_a, 3 - sh \tmp_a, 12(\output_ptr) - j 600f -606: # remainder == 1, 0x110 - esp.movi.32.a \remainder_data, \tmp_a, 0 - sw \tmp_a, 0(\output_ptr) - esp.movi.32.a \remainder_data, \tmp_a, 1 - sw \tmp_a, 4(\output_ptr) - esp.movi.32.a \remainder_data, \tmp_a, 2 - sw \tmp_a, 8(\output_ptr) - j 600f -605: # remainder == 1, 0x101 - andi t0, \c_remainder, 1 - beqz t0, 604f - esp.movi.32.a \remainder_data, \tmp_a, 0 - sw \tmp_a, 0(\output_ptr) - esp.movi.32.a \remainder_data, \tmp_a, 1 - sw \tmp_a, 4(\output_ptr) - esp.movi.32.a \remainder_data, \tmp_a, 2 - sh \tmp_a, 8(\output_ptr) - j 600f -604: # remainder == 1, 0x100 - esp.movi.32.a \remainder_data, \tmp_a, 0 - sw \tmp_a, 0(\output_ptr) - esp.movi.32.a \remainder_data, \tmp_a, 1 - sw \tmp_a, 4(\output_ptr) - j 600f -603: # remainder == 1, 0x011 - andi t0, \c_remainder, 2 - beqz t0, 601f - andi t0, \c_remainder, 1 - beqz t0, 602f - esp.movi.32.a \remainder_data, \tmp_a, 0 - sw \tmp_a, 0(\output_ptr) - esp.movi.32.a \remainder_data, \tmp_a, 1 - sh \tmp_a, 4(\output_ptr) - j 600f -602: # remainder == 1, 0x010 - esp.movi.32.a \remainder_data, \tmp_a, 0 - sw \tmp_a, 0(\output_ptr) - j 600f -601: # remainder == 1, 0x001 - andi t0, \c_remainder, 1 - beqz t0, 600f - esp.movi.32.a \remainder_data, \tmp_a, 0 - sh \tmp_a, 0(\output_ptr) -600: -.endm \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s16_add2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s16_add2d.S deleted file mode 100644 index a8d957ee..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s16_add2d.S +++ /dev/null @@ -1,1599 +0,0 @@ -#include "dl_esp32p4_s16.S" -#include "dl_esp32p4_common.S" - -############################################################################################################################################################ -#### -#### esp32p4_s16_add2d_11c series -#### -############################################################################################################################################################ - -.macro dl_esp32p4_s16_rescale_add_rescale_output input0, input1, output, output_scale, output_shift - esp.zero.qacc - esp.vmulas.s16.qacc \input0, \output_scale - esp.vmulas.s16.qacc \input1, \output_scale - esp.srcmb.s16.qacc \output, \output_shift, 1 -.endm - - - .align 2 - .text - .global dl_esp32p4_s16_add2d_11c - .type dl_esp32p4_s16_add2d_11c, @function - #.section .iram1 -dl_esp32p4_s16_add2d_11c: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int16_t *output_ptr - # a1: int16_t *input0_ptr - # a2: int16_t *input1_ptr - # a3: void *args - # a4: c_div_2x_1 - # a5: c_left_x_1 - - lw a4, 68(a3) - lw a5, 72(a3) - - li t0, 1 - blt a4, t0, dl_esp32p4_s16_add2d_small_channel - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - - add t0, a4, x0 - blez t0, 1f - 0: - esp.vld.128.ip q2, a1, 16 - esp.vadd.s16.ld.incp q3, a2, q4, q0, q1 - esp.vst.128.ip q4, a0, 16 - - esp.vld.128.ip q0, a1, 16 - esp.vadd.s16.ld.incp q1, a2, q5, q2, q3 - esp.vst.128.ip q5, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - esp.vld.128.ip q2, a1, 16 - esp.vadd.s16.ld.incp q3, a2, q4, q0, q1 - esp.vst.128.ip q4, a0, 16 - li t0, 1 - beq a5, t0, 2f #remainder == 2*16byte - li t0, 2 - beq a5, t0, 3f #remainder == 3*16byte - -2: - esp.vadd.s16 q5, q2, q3 - esp.vst.128.ip q5, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret -3: - esp.vld.128.ip q0, a1, 16 - esp.vadd.s16.ld.incp q1, a2, q5, q2, q3 - esp.vst.128.ip q5, a0, 16 - - esp.vadd.s16 q4, q0, q1 - esp.vst.128.ip q4, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - -dl_esp32p4_s16_add2d_small_channel: - bltz a5, 5f - add t0, a5, x0 - blez t0, 3f - 2: - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - esp.vadd.s16 q2, q0, q1 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - esp.vadd.s16 q2, q0, q1 - esp.vst.128.ip q2, a0, 16 -5: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - .align 2 - .text - .global dl_esp32p4_s16_rescale_add2d_11c - .type dl_esp32p4_s16_rescale_add2d_11c, @function - #.section .iram1 -dl_esp32p4_s16_rescale_add2d_11c: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int16_t *output_ptr - # a1: int16_t *input0_ptr - # a2: int16_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: input_shift - # t3: output_scale - # t4: output_shift - - lw a4, 64(a3) - lw a5, 88(a3) - lw t3, 96(a3) - lw t4, 92(a3) - - li t0, 1 - beq t3, t0, dl_esp32p4_s16_rescale_add2d_output - -dl_esp32p4_s16_rescale_add2d_output_scale: - sh t3, 0(sp) - add s11, sp, x0 - esp.vldbc.16.ip q7, s11, 0 # all output_scale - - # addi a4, a4, 1 - add t0, a4, x0 - blez t0, 3f - 2: - esp.ldqa.s16.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - esp.srcmb.s16.qacc q1, a5, 1 - - dl_esp32p4_s16_rescale_add_rescale_output q0, q1, q1, q7, t4 - - esp.vst.128.ip q1, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - esp.ldqa.s16.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - esp.srcmb.s16.qacc q1, a5, 1 - dl_esp32p4_s16_rescale_add_rescale_output q0, q1, q1, q7, t4 - - esp.vst.128.ip q1, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - -dl_esp32p4_s16_rescale_add2d_output: - li s1, 1 - sh s1, 0(sp) - add s11, sp, x0 - esp.vldbc.16.ip q7, s11, 0 # all 1 - - esp.ldqa.s16.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - - add t0, a4, x0 - blez t0, 5f - 4: - esp.srcmb.s16.qacc q1, a5, 1 - esp.vmulas.s16.qacc.ld.ip q0, a1, 16, q0, q7 - esp.srcmb.s16.qacc q1, t4, 1 - esp.ldqa.s16.128.ip a2, 16 - esp.vst.128.ip q1, a0, 16 - addi t0, t0, -1 - bgtz t0, 4b - 5: - - esp.srcmb.s16.qacc q1, a5, 1 - esp.vmulas.s16.qacc q0, q7 - esp.srcmb.s16.qacc q1, t4, 1 - - esp.vst.128.ip q1, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - .align 2 - .text - .global dl_esp32p4_s16_add2d_11c_relu - .type dl_esp32p4_s16_add2d_11c_relu, @function - #.section .iram1 -dl_esp32p4_s16_add2d_11c_relu: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int16_t *output_ptr - # a1: int16_t *input0_ptr - # a2: int16_t *input1_ptr - # a3: void *args - # a4: c_div_2x_1 - # a5: c_left_x - # s8: activation_alpha - # s9: activation_shift - - lw a4, 68(a3) - lw a5, 72(a3) - lw s8, 52(a3) - lw s9, 60(a3) - - - beqz a4, dl_esp32p4_s16_add2d_relu_small_channel - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - - add t0, a4, x0 - blez t0, 1f - 0: - esp.vld.128.ip q2, a1, 16 - esp.vadd.s16.ld.incp q3, a2, q4, q0, q1 - esp.vrelu.s16 q4, s8, s9 - esp.vst.128.ip q4, a0, 16 - - esp.vld.128.ip q0, a1, 16 - esp.vadd.s16.ld.incp q1, a2, q5, q2, q3 - esp.vrelu.s16 q5, s8, s9 - esp.vst.128.ip q5, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - esp.vld.128.ip q2, a1, 16 - esp.vadd.s16.ld.incp q3, a2, q4, q0, q1 - esp.vrelu.s16 q4, s8, s9 - esp.vst.128.ip q4, a0, 16 - li t0, 1 - beq a5, t0, 2f #remainder == 2*16byte - li t0, 2 - beq a5, t0, 3f #remainder == 3*16byte -2: - esp.vadd.s16 q5, q2, q3 - esp.vrelu.s16 q5, s8, s9 - esp.vst.128.ip q5, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret -3: - esp.vld.128.ip q0, a1, 16 - esp.vadd.s16.ld.incp q1, a2, q5, q2, q3 - esp.vrelu.s16 q5, s8, s9 - esp.vst.128.ip q5, a0, 16 - - esp.vadd.s16 q4, q0, q1 - esp.vrelu.s16 q4, s8, s9 - esp.vst.128.ip q4, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - -dl_esp32p4_s16_add2d_relu_small_channel: - bltz a5, 5f - add t0, a5, x0 - blez t0, 3f - 2: - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - esp.vadd.s16 q2, q0, q1 - esp.vrelu.s16 q2, s8, s9 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - esp.vadd.s16 q2, q0, q1 - esp.vrelu.s16 q2, s8, s9 - esp.vst.128.ip q2, a0, 16 -5: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - .align 2 - .text - .global dl_esp32p4_s16_rescale_add2d_11c_relu - .type dl_esp32p4_s16_rescale_add2d_11c_relu, @function - #.section .iram1 -dl_esp32p4_s16_rescale_add2d_11c_relu: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int16_t *output_ptr - # a1: int16_t *input0_ptr - # a2: int16_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: input_shift - # t3: output_scale - # t4: output_shift - # s8: activation_alpha - # s9: activation_shift - - lw a4, 64(a3) - lw a5, 88(a3) - lw t3, 96(a3) - lw t4, 92(a3) - lw s8, 52(a3) - lw s9, 60(a3) - - li t0, 1 - beq t3, t0, dl_esp32p4_s16_rescale_add2d_output_relu - -dl_esp32p4_s16_rescale_add2d_output_scale_relu: - sh t3, 0(sp) - add s11, sp, x0 - esp.vldbc.16.ip q7, s11, 0 # all output_scale - - add t0, a4, x0 - blez t0, 3f - 2: - esp.ldqa.s16.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - esp.srcmb.s16.qacc q1, a5, 1 - - dl_esp32p4_s16_rescale_add_rescale_output q0, q1, q1, q7, t4 - - esp.vrelu.s16 q1, s8, s9 - esp.vst.128.ip q1, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - esp.ldqa.s16.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - esp.srcmb.s16.qacc q1, a5, 1 - - dl_esp32p4_s16_rescale_add_rescale_output q0, q1, q1, q7, t4 - - esp.vrelu.s16 q1, s8, s9 - esp.vst.128.ip q1, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - -dl_esp32p4_s16_rescale_add2d_output_relu: - li s1, 1 - sh s1, 0(sp) - add s11, sp, x0 - esp.vldbc.16.ip q7, s11, 0 # all 1 - - esp.ldqa.s16.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - - add t0, a4, x0 - blez t0, 5f - 4: - esp.srcmb.s16.qacc q1, a5, 1 - esp.vmulas.s16.qacc.ld.ip q0, a1, 16, q0, q7 - esp.srcmb.s16.qacc q1, t4, 1 - esp.ldqa.s16.128.ip a2, 16 - esp.vrelu.s16 q1, s8, s9 - esp.vst.128.ip q1, a0, 16 - addi t0, t0, -1 - bgtz t0, 4b - 5: - - esp.srcmb.s16.qacc q1, a5, 1 - esp.vmulas.s16.qacc q0, q7 - esp.srcmb.s16.qacc q1, t4, 1 - esp.vrelu.s16 q1, s8, s9 - esp.vst.128.ip q1, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - .align 2 - .text - .global dl_esp32p4_s16_add2d_11c_prelu - .type dl_esp32p4_s16_add2d_11c_prelu, @function - #.section .iram1 -dl_esp32p4_s16_add2d_11c_prelu: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int16_t *output_ptr - # a1: int16_t *input0_ptr - # a2: int16_t *input1_ptr - # a3: void *args - # a4: c_div_2x_1 - # a5: c_left_x - # s8: activation_alpha_ptr - # s9: activation_shift - - lw a4, 68(a3) - lw a5, 72(a3) - lw s8, 56(a3) - lw s9, 60(a3) - - beqz a4, dl_esp32p4_s16_add2d_prelu_small_channel - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - - add t0, a4, x0 - blez t0, 1f - 0: - esp.vld.128.ip q2, a1, 16 - esp.vld.128.ip q6, s8, 16 - esp.vadd.s16.ld.incp q3, a2, q4, q0, q1 - esp.vprelu.s16 q4, q4, q6, s9 - esp.vst.128.ip q4, a0, 16 - - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q6, s8, 16 - esp.vadd.s16.ld.incp q1, a2, q5, q2, q3 - esp.vprelu.s16 q5, q5, q6, s9 - esp.vst.128.ip q5, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - esp.vld.128.ip q2, a1, 16 - esp.vld.128.ip q6, s8, 16 - esp.vadd.s16.ld.incp q3, a2, q4, q0, q1 - esp.vprelu.s16 q4, q4, q6, s9 - esp.vst.128.ip q4, a0, 16 - li t0, 1 - beq a5, t0, 2f #remainder == 2*16byte - li t0, 2 - beq a5, t0, 3f #remainder == 3*16byte - -2: - esp.vld.128.ip q6, s8, 16 - esp.vadd.s16 q5, q2, q3 - esp.vprelu.s16 q5, q5, q6, s9 - esp.vst.128.ip q5, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret -3: - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q6, s8, 16 - esp.vadd.s16.ld.incp q1, a2, q5, q2, q3 - esp.vprelu.s16 q5, q5, q6, s9 - esp.vst.128.ip q5, a0, 16 - - esp.vld.128.ip q6, s8, 16 - esp.vadd.s16 q4, q0, q1 - esp.vprelu.s16 q4, q4, q6, s9 - esp.vst.128.ip q4, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - -dl_esp32p4_s16_add2d_prelu_small_channel: - bltz a5, 5f - add t0, a5, x0 - blez t0, 3f - 2: - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - esp.vld.128.ip q6, s8, 16 - esp.vadd.s16 q2, q0, q1 - esp.vprelu.s16 q2, q2, q6, s9 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - esp.vld.128.ip q6, s8, 16 - esp.vadd.s16 q2, q0, q1 - esp.vprelu.s16 q2, q2, q6, s9 - esp.vst.128.ip q2, a0, 16 -5: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - .align 2 - .text - .global dl_esp32p4_s16_rescale_add2d_11c_prelu - .type dl_esp32p4_s16_rescale_add2d_11c_prelu, @function - #.section .iram1 -dl_esp32p4_s16_rescale_add2d_11c_prelu: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int16_t *output_ptr - # a1: int16_t *input0_ptr - # a2: int16_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: input_shift - # t3: output_scale - # t4: output_shift - # s8: activation_alpha_ptr - # s9: activation_shift - - lw a4, 64(a3) - lw a5, 88(a3) - lw t3, 96(a3) - lw t4, 92(a3) - lw s8, 56(a3) - lw s9, 60(a3) - - li t0, 1 - beq t3, t0, dl_esp32p4_s16_rescale_add2d_output_prelu - -dl_esp32p4_s16_rescale_add2d_output_scale_prelu: - sh t3, 0(sp) - add s11, sp, x0 - esp.vldbc.16.ip q7, s11, 0 # all output_scale - - # addi a4, a4, 1 - add t0, a4, x0 - blez t0, 3f - 2: - esp.ldqa.s16.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - esp.srcmb.s16.qacc q1, a5, 1 - - dl_esp32p4_s16_rescale_add_rescale_output q0, q1, q1, q7, t4 - - esp.vld.128.ip q5, s8, 16 - - esp.vprelu.s16 q1, q1, q5, s9 - esp.vst.128.ip q1, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - esp.ldqa.s16.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - esp.srcmb.s16.qacc q1, a5, 1 - - dl_esp32p4_s16_rescale_add_rescale_output q0, q1, q1, q7, t4 - - esp.vld.128.ip q5, s8, 16 - - esp.vprelu.s16 q1, q1, q5, s9 - esp.vst.128.ip q1, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - -dl_esp32p4_s16_rescale_add2d_output_prelu: - li s1, 1 - sh s1, 0(sp) - add s11, sp, x0 - esp.vldbc.16.ip q7, s11, 0 # all 1 - - esp.ldqa.s16.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - - add t0, a4, x0 - blez t0, 5f - 4: - esp.srcmb.s16.qacc q1, a5, 1 - esp.vmulas.s16.qacc.ld.ip q0, a1, 16, q0, q7 - esp.srcmb.s16.qacc q1, t4, 1 - - esp.vld.128.ip q6, s8, 16 - esp.ldqa.s16.128.ip a2, 16 - esp.vprelu.s16 q1, q1, q6, s9 - esp.vst.128.ip q1, a0, 16 - addi t0, t0, -1 - bgtz t0, 4b - 5: - - esp.srcmb.s16.qacc q1, a5, 1 - esp.vmulas.s16.qacc q0, q7 - - esp.vld.128.ip q6, s8, 16 - esp.srcmb.s16.qacc q1, t4, 1 - esp.vprelu.s16 q1, q1, q6, s9 - esp.vst.128.ip q1, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -############################################################################################################################################################ -#### -#### esp32p4_s16_unaligned_add2d_11c series -#### -############################################################################################################################################################ - - .align 2 - .text - .global dl_esp32p4_s16_unaligned_add2d_11c - .type dl_esp32p4_s16_unaligned_add2d_11c, @function - #.section .iram1 -dl_esp32p4_s16_unaligned_add2d_11c: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int16_t *output_ptr - # a1: int16_t *input0_ptr - # a2: int16_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: input_shift - # t3: output_scale - # t4: output_shift - # t5: c_remainder - - lw a4, 64(a3) - lw t5, 76(a3) - lw a5, 88(a3) - - bgez a5, dl_esp32p4_s16_unaligned_rescale_add2d_11c - -# input0 exp = input1 exp = output exp - - esp.ld.128.usar.ip q5, a0, 0 #get output_ptr sar_byte - esp.movx.r.sar.bytes s1 - - bltz a4, dl_esp32p4_s16_unaligned_add2d_11c_small_remainder # channel < 16 - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - beqz s1, dl_esp32p4_s16_unaligned_add2d_11c_0 - li t0, 8 - beq s1, t0, dl_esp32p4_s16_unaligned_add2d_11c_1 - - add t0, a4, x0 - blez t0, 1f - 0: - esp.src.q.qup q2, q0, q1 - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - esp.vadd.s16 q2, q2, q5 - esp.ld.128.usar.ip q1, a1, 16 - dl_esp32p4_128b_unaligned_store0 q2, a0, s1 - addi t0, t0, -1 - bgtz t0, 0b - 1: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s16 q2, q2, q5 - dl_esp32p4_128b_unaligned_store0 q2, a0, s1 - j dl_esp32p4_s16_unaligned_add2d_11c_remainder - - #output sar = 0 -dl_esp32p4_s16_unaligned_add2d_11c_0: - add t0, a4, x0 - blez t0, 3f - 2: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s16 q2, q2, q5 - - esp.ld.128.usar.ip q1, a1, 16 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s16 q2, q2, q5 - esp.vst.128.ip q2, a0, 16 - j dl_esp32p4_s16_unaligned_add2d_11c_remainder - - #output sar = 8 -dl_esp32p4_s16_unaligned_add2d_11c_1: - add t0, a4, x0 - blez t0, 5f - 4: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s16 q2, q2, q5 - - esp.ld.128.usar.ip q1, a1, 16 - dl_esp32p4_128b_unaligned_store1 q2, a0 - addi t0, t0, -1 - bgtz t0, 4b - 5: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s16 q2, q2, q5 - dl_esp32p4_128b_unaligned_store1 q2, a0 - j dl_esp32p4_s16_unaligned_add2d_11c_remainder - -dl_esp32p4_s16_unaligned_add2d_11c_small_remainder: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 - - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 - -dl_esp32p4_s16_unaligned_add2d_11c_remainder: - - beqz t5, dl_esp32p4_s16_unaligned_add2d_end - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.vadd.s16 q2, q2, q5 - - srli t5, t5, 1 - dl_esp32p4_s16_store_remainder q2, t5, s1, a0 - -dl_esp32p4_s16_unaligned_add2d_end: - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -## rescaled add -dl_esp32p4_s16_unaligned_rescale_add2d_11c: - lw t3, 96(a3) # output_scale - lw t4, 92(a3) # output_shift - - li t0, 1 - beq t3, t0, dl_esp32p4_s16_rescale_unaligned_add2d_output_shift - - -### rescaled to output by *scale) >> shift -dl_esp32p4_s16_rescale_unaligned_add2d_output_scale: - - sw t3, 0(sp) - add s11, sp, x0 - esp.vldbc.16.ip q7, s11, 0 # all output_scale - - bltz a4, dl_esp32p4_s16_rescale_unaligned_add2d_scale_small_remainder # channel < 16 - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - add t0, a4, x0 - blez t0, 7f - 6: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.mov.s16.qacc q5 - esp.srcmb.s16.qacc q1, a5, 1 - - dl_esp32p4_s16_rescale_add_rescale_output q2, q1, q2, q7, t4 - - esp.ld.128.usar.ip q1, a1, 16 - dl_esp32p4_128b_unaligned_store0 q2, a0, s0 - addi t0, t0, -1 - bgtz t0, 6b - 7: - - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - - esp.mov.s16.qacc q5 - esp.srcmb.s16.qacc q1, a5, 1 - - dl_esp32p4_s16_rescale_add_rescale_output q2, q1, q2, q7, t4 - - dl_esp32p4_128b_unaligned_store0 q2, a0, s1 - j dl_esp32p4_s16_rescale_unaligned_add2d_scale_remainder - - -dl_esp32p4_s16_rescale_unaligned_add2d_scale_small_remainder: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - -dl_esp32p4_s16_rescale_unaligned_add2d_scale_remainder: - beqz t5, dl_esp32p4_s16_unaligned_rescale_add2d_output_scale_end # c remainder - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.mov.s16.qacc q5 - esp.srcmb.s16.qacc q1, a5, 1 - - dl_esp32p4_s16_rescale_add_rescale_output q2, q1, q2, q7, t4 - - srli t5, t5, 1 - dl_esp32p4_s16_store_remainder q2, t5, s0, a0 - -dl_esp32p4_s16_unaligned_rescale_add2d_output_scale_end: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -### rescaled to output by right shift -dl_esp32p4_s16_rescale_unaligned_add2d_output_shift: - li s1, 1 - sh s1, 0(sp) - add s11, sp, x0 - esp.vldbc.16.ip q7, s11, 0 # all 1 - - bltz a4, dl_esp32p4_s16_rescale_unaligned_add2d_shift_small_remainder # channel < 16 - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - add t0, a4, x0 - blez t0, 9f - 8: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - esp.mov.s16.qacc q5 - - esp.srcmb.s16.qacc q5, a5, 1 - esp.vmulas.s16.qacc q2, q7 - esp.srcmb.s16.qacc q5, t4, 1 - - esp.ld.128.usar.ip q1, a1, 16 - dl_esp32p4_128b_unaligned_store0 q5, a0, s1 - addi t0, t0, -1 - bgtz t0, 8b - 9: - addi a1, a1, -16 - add a1, a1, t5 - - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - esp.mov.s16.qacc q5 - - esp.srcmb.s16.qacc q5, a5, 1 - esp.vmulas.s16.qacc q2, q7 - esp.srcmb.s16.qacc q5, t4, 1 - - dl_esp32p4_128b_unaligned_store0 q5, a0, s1 - j dl_esp32p4_s16_rescale_unaligned_add2d_shift_remainder - - - -dl_esp32p4_s16_rescale_unaligned_add2d_shift_small_remainder: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - -dl_esp32p4_s16_rescale_unaligned_add2d_shift_remainder: - beqz t5, dl_esp32p4_s16_unaligned_rescale_add2d_output_shift_end # c remainder - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.mov.s16.qacc q5 - esp.srcmb.s16.qacc q5, a5, 1 - esp.vmulas.s16.qacc q2, q7 - esp.srcmb.s16.qacc q5, t4, 1 - - srli t5, t5, 1 - dl_esp32p4_s16_store_remainder q5, t5, s1, a0 - -dl_esp32p4_s16_unaligned_rescale_add2d_output_shift_end: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - .align 2 - .text - .global dl_esp32p4_s16_unaligned_add2d_11c_relu - .type dl_esp32p4_s16_unaligned_add2d_11c_relu, @function - #.section .iram1 -dl_esp32p4_s16_unaligned_add2d_11c_relu: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int16_t *output_ptr - # a1: int16_t *input0_ptr - # a2: int16_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: input_shift - # t3: output_scale - # t4: output_shift - # t5: c_remainder - # s8: activation_alpha - # s9: activation_shift - - lw a4, 64(a3) - lw t5, 76(a3) - lw a5, 88(a3) - lw s8, 52(a3) - lw s9, 60(a3) - - bgez a5, dl_esp32p4_s16_unaligned_rescale_add2d_11c_relu - -# input0 exp = input1 exp = output exp - - esp.ld.128.usar.ip q5, a0, 0 #get output_ptr sar_byte - esp.movx.r.sar.bytes s1 - - bltz a4, dl_esp32p4_s16_unaligned_add2d_11c_small_remainder_relu # channel < 8 - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - beqz s1, dl_esp32p4_s16_unaligned_add2d_11c_relu_0 - li t0, 8 - beq s1, t0, dl_esp32p4_s16_unaligned_add2d_11c_relu_1 - - - add t0, a4, x0 - blez t0, 1f - 0: - esp.src.q.qup q2, q0, q1 - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - esp.vadd.s16 q2, q2, q5 - esp.ld.128.usar.ip q1, a1, 16 - esp.vrelu.s16 q2, s8, s9 - dl_esp32p4_128b_unaligned_store0 q2, a0, s1 - addi t0, t0, -1 - bgtz t0, 0b - 1: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s16 q2, q2, q5 - esp.vrelu.s16 q2, s8, s9 - dl_esp32p4_128b_unaligned_store0 q2, a0, s1 - j dl_esp32p4_s16_unaligned_add2d_11c_remainder_relu - - #output sar = 0 -dl_esp32p4_s16_unaligned_add2d_11c_relu_0: - add t0, a4, x0 - blez t0, 3f - 2: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s16 q2, q2, q5 - - esp.ld.128.usar.ip q1, a1, 16 - esp.vrelu.s16 q2, s8, s9 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s16 q2, q2, q5 - esp.vrelu.s16 q2, s8, s9 - esp.vst.128.ip q2, a0, 16 - j dl_esp32p4_s16_unaligned_add2d_11c_remainder_relu - - #output sar = 8 -dl_esp32p4_s16_unaligned_add2d_11c_relu_1: - add t0, a4, x0 - blez t0, 5f - 4: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s16 q2, q2, q5 - - esp.ld.128.usar.ip q1, a1, 16 - esp.vrelu.s16 q2, s8, s9 - dl_esp32p4_128b_unaligned_store1 q2, a0 - addi t0, t0, -1 - bgtz t0, 4b - 5: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s16 q2, q2, q5 - esp.vrelu.s16 q2, s8, s9 - dl_esp32p4_128b_unaligned_store1 q2, a0 - j dl_esp32p4_s16_unaligned_add2d_11c_remainder_relu - -dl_esp32p4_s16_unaligned_add2d_11c_small_remainder_relu: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 - - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 - -dl_esp32p4_s16_unaligned_add2d_11c_remainder_relu: - - beqz t5, dl_esp32p4_s16_unaligned_add2d_end_relu - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.vadd.s16 q2, q2, q5 - esp.vrelu.s16 q2, s8, s9 - srli t5, t5, 1 - dl_esp32p4_s16_store_remainder q2, t5, s1, a0 - -dl_esp32p4_s16_unaligned_add2d_end_relu: - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -## rescaled add -dl_esp32p4_s16_unaligned_rescale_add2d_11c_relu: - lw t3, 96(a3) # output_scale - lw t4, 92(a3) # output_shift - - li t0, 1 - beq t3, t0, dl_esp32p4_s16_rescale_unaligned_add2d_output_shift_relu - - -### rescaled to output by *scale) >> shift -dl_esp32p4_s16_rescale_unaligned_add2d_output_scale_relu: - - sw t3, 0(sp) - add s11, sp, x0 - esp.vldbc.16.ip q7, s11, 0 # all output_scale - - bltz a4, dl_esp32p4_s16_rescale_unaligned_add2d_scale_small_remainder_relu # channel < 16 - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - add t0, a4, x0 - blez t0, 7f - 6: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.mov.s16.qacc q5 - esp.srcmb.s16.qacc q1, a5, 1 - - dl_esp32p4_s16_rescale_add_rescale_output q2, q1, q2, q7, t4 - esp.ld.128.usar.ip q1, a1, 16 - esp.vrelu.s16 q2, s8, s9 - dl_esp32p4_128b_unaligned_store0 q2, a0, s0 - addi t0, t0, -1 - bgtz t0, 6b - 7: - - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - - esp.mov.s16.qacc q5 - esp.srcmb.s16.qacc q1, a5, 1 - - dl_esp32p4_s16_rescale_add_rescale_output q2, q1, q2, q7, t4 - - esp.vrelu.s16 q2, s8, s9 - dl_esp32p4_128b_unaligned_store0 q2, a0, s1 - j dl_esp32p4_s16_rescale_unaligned_add2d_scale_remainder_relu - - -dl_esp32p4_s16_rescale_unaligned_add2d_scale_small_remainder_relu: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - -dl_esp32p4_s16_rescale_unaligned_add2d_scale_remainder_relu: - beqz t5, dl_esp32p4_s16_unaligned_rescale_add2d_output_scale_end_relu # c remainder - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.mov.s16.qacc q5 - esp.srcmb.s16.qacc q1, a5, 1 - - dl_esp32p4_s16_rescale_add_rescale_output q2, q1, q2, q7, t4 - - esp.vrelu.s16 q2, s8, s9 - srli t5, t5, 1 - dl_esp32p4_s16_store_remainder q2, t5, s1, a0 - -dl_esp32p4_s16_unaligned_rescale_add2d_output_scale_end_relu: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -### rescaled to output by right shift -dl_esp32p4_s16_rescale_unaligned_add2d_output_shift_relu: - li s1, 1 - sh s1, 0(sp) - add s11, sp, x0 - esp.vldbc.16.ip q7, s11, 0 # all 1 - - bltz a4, dl_esp32p4_s16_rescale_unaligned_add2d_shift_small_remainder_relu # channel < 16 - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - add t0, a4, x0 - blez t0, 9f - 8: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - esp.mov.s16.qacc q5 - - esp.srcmb.s16.qacc q5, a5, 1 - esp.vmulas.s16.qacc q2, q7 - esp.srcmb.s16.qacc q5, t4, 1 - - esp.ld.128.usar.ip q1, a1, 16 - esp.vrelu.s16 q5, s8, s9 - dl_esp32p4_128b_unaligned_store0 q5, a0, s1 - addi t0, t0, -1 - bgtz t0, 8b - 9: - addi a1, a1, -16 - add a1, a1, t5 - - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - esp.mov.s16.qacc q5 - - esp.srcmb.s16.qacc q5, a5, 1 - esp.vmulas.s16.qacc q2, q7 - esp.srcmb.s16.qacc q5, t4, 1 - - esp.vrelu.s16 q5, s8, s9 - dl_esp32p4_128b_unaligned_store0 q5, a0, s1 - j dl_esp32p4_s16_rescale_unaligned_add2d_shift_remainder_relu - - - -dl_esp32p4_s16_rescale_unaligned_add2d_shift_small_remainder_relu: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - -dl_esp32p4_s16_rescale_unaligned_add2d_shift_remainder_relu: - beqz t5, dl_esp32p4_s16_unaligned_rescale_add2d_output_shift_end_relu # c remainder - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.mov.s16.qacc q5 - esp.srcmb.s16.qacc q5, a5, 1 - esp.vmulas.s16.qacc q2, q7 - esp.srcmb.s16.qacc q5, t4, 1 - - esp.vrelu.s16 q5, s8, s9 - srli t5, t5, 1 - dl_esp32p4_s16_store_remainder q5, t5, s1, a0 - -dl_esp32p4_s16_unaligned_rescale_add2d_output_shift_end_relu: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - .align 2 - .text - .global dl_esp32p4_s16_unaligned_add2d_11c_prelu - .type dl_esp32p4_s16_unaligned_add2d_11c_prelu, @function - #.section .iram1 -dl_esp32p4_s16_unaligned_add2d_11c_prelu: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int16_t *output_ptr - # a1: int16_t *input0_ptr - # a2: int16_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: input_shift - # t3: output_scale - # t4: output_shift - # t5: c_remainder - # s8: activation_alpha_ptr - # s9: activation_shift - - lw a4, 64(a3) - lw t5, 76(a3) - lw a5, 88(a3) - lw s8, 56(a3) - lw s9, 60(a3) - - bgez a5, dl_esp32p4_s16_unaligned_rescale_add2d_11c_prelu - -# input0 exp = input1 exp = output exp - - esp.ld.128.usar.ip q5, a0, 0 #get output_ptr sar_byte - esp.movx.r.sar.bytes s1 - - bltz a4, dl_esp32p4_s16_unaligned_add2d_11c_small_remainder_prelu # channel < 16 - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - beqz s1, dl_esp32p4_s16_unaligned_add2d_11c_prelu_0 - li t0, 8 - beq s1, t0, dl_esp32p4_s16_unaligned_add2d_11c_prelu_1 - - - add t0, a4, x0 - blez t0, 1f - 0: - esp.src.q.qup q2, q0, q1 - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - esp.vadd.s16 q2, q2, q5 - - esp.vld.128.ip q6, s8, 16 - esp.ld.128.usar.ip q1, a1, 16 - esp.vprelu.s16 q2, q2, q6, s9 - dl_esp32p4_128b_unaligned_store0 q2, a0, s1 - addi t0, t0, -1 - bgtz t0, 0b - 1: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vld.128.ip q6, s8, 16 - esp.vadd.s16 q2, q2, q5 - esp.vprelu.s16 q2, q2, q6, s9 - dl_esp32p4_128b_unaligned_store0 q2, a0, s1 - j dl_esp32p4_s16_unaligned_add2d_11c_remainder_prelu - - #output sar = 0 -dl_esp32p4_s16_unaligned_add2d_11c_prelu_0: - add t0, a4, x0 - blez t0, 3f - 2: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s16 q2, q2, q5 - - esp.vld.128.ip q6, s8, 16 - esp.ld.128.usar.ip q1, a1, 16 - esp.vprelu.s16 q2, q2, q6, s9 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vld.128.ip q6, s8, 16 - esp.vadd.s16 q2, q2, q5 - esp.vprelu.s16 q2, q2, q6, s9 - esp.vst.128.ip q2, a0, 16 - j dl_esp32p4_s16_unaligned_add2d_11c_remainder_prelu - - #output sar = 8 -dl_esp32p4_s16_unaligned_add2d_11c_prelu_1: - add t0, a4, x0 - blez t0, 5f - 4: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s16 q2, q2, q5 - - esp.vld.128.ip q6, s8, 16 - esp.ld.128.usar.ip q1, a1, 16 - esp.vprelu.s16 q2, q2, q6, s9 - dl_esp32p4_128b_unaligned_store1 q2, a0 - addi t0, t0, -1 - bgtz t0, 4b - 5: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vld.128.ip q6, s8, 16 - esp.vadd.s16 q2, q2, q5 - esp.vprelu.s16 q2, q2, q6, s9 - dl_esp32p4_128b_unaligned_store1 q2, a0 - j dl_esp32p4_s16_unaligned_add2d_11c_remainder_prelu - -dl_esp32p4_s16_unaligned_add2d_11c_small_remainder_prelu: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 - - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 - -dl_esp32p4_s16_unaligned_add2d_11c_remainder_prelu: - - beqz t5, dl_esp32p4_s16_unaligned_add2d_end_prelu - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.vld.128.ip q6, s8, 16 - esp.vadd.s16 q2, q2, q5 - esp.vprelu.s16 q2, q2, q6, s9 - srli t5, t5, 1 - dl_esp32p4_s16_store_remainder q2, t5, s1, a0 - -dl_esp32p4_s16_unaligned_add2d_end_prelu: - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -## rescaled add -dl_esp32p4_s16_unaligned_rescale_add2d_11c_prelu: - lw t3, 96(a3) # output_scale - lw t4, 92(a3) # output_shift - - li t0, 1 - beq t3, t0, dl_esp32p4_s16_rescale_unaligned_add2d_output_shift_prelu - - -### rescaled to output by *scale) >> shift -dl_esp32p4_s16_rescale_unaligned_add2d_output_scale_prelu: - - sw t3, 0(sp) - add s11, sp, x0 - esp.vldbc.16.ip q7, s11, 0 # all output_scale - # ssr t4 #output shift - # li s1, 0 - - bltz a4, dl_esp32p4_s16_rescale_unaligned_add2d_scale_small_remainder_prelu # channel < 16 - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - add t0, a4, x0 - blez t0, 7f - 6: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.mov.s16.qacc q5 - esp.srcmb.s16.qacc q1, a5, 1 - - dl_esp32p4_s16_rescale_add_rescale_output q2, q1, q2, q7, t4 - esp.vld.128.ip q6, s8, 16 - esp.ld.128.usar.ip q1, a1, 16 - esp.vprelu.s16 q2, q2, q6, s9 - dl_esp32p4_128b_unaligned_store0 q2, a0, s0 - addi t0, t0, -1 - bgtz t0, 6b - 7: - - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - - esp.mov.s16.qacc q5 - esp.srcmb.s16.qacc q1, a5, 1 - - esp.vld.128.ip q6, s8, 16 - dl_esp32p4_s16_rescale_add_rescale_output q2, q1, q2, q7, t4 - - esp.vprelu.s16 q2, q2, q6, s9 - dl_esp32p4_128b_unaligned_store0 q2, a0, s1 - j dl_esp32p4_s16_rescale_unaligned_add2d_scale_remainder_prelu - - -dl_esp32p4_s16_rescale_unaligned_add2d_scale_small_remainder_prelu: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - -dl_esp32p4_s16_rescale_unaligned_add2d_scale_remainder_prelu: - beqz t5, dl_esp32p4_s16_unaligned_rescale_add2d_output_scale_end_prelu # c remainder - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.mov.s16.qacc q5 - esp.srcmb.s16.qacc q1, a5, 1 - - esp.vld.128.ip q6, s8, 16 - dl_esp32p4_s16_rescale_add_rescale_output q2, q1, q2, q7, t4 - - esp.vprelu.s16 q2, q2, q6, s9 - srli t5, t5, 1 - dl_esp32p4_s16_store_remainder q2, t5, s1, a0 - -dl_esp32p4_s16_unaligned_rescale_add2d_output_scale_end_prelu: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -### rescaled to output by right shift -dl_esp32p4_s16_rescale_unaligned_add2d_output_shift_prelu: - li s1, 1 - sh s1, 0(sp) - add s11, sp, x0 - esp.vldbc.16.ip q7, s11, 0 # all 1 - - bltz a4, dl_esp32p4_s16_rescale_unaligned_add2d_shift_small_remainder_prelu # channel < 16 - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - add t0, a4, x0 - blez t0, 9f - 8: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - esp.mov.s16.qacc q5 - - esp.srcmb.s16.qacc q5, a5, 1 - esp.vmulas.s16.qacc q2, q7 - esp.srcmb.s16.qacc q5, t4, 1 - - esp.vld.128.ip q6, s8, 16 - esp.ld.128.usar.ip q1, a1, 16 - esp.vprelu.s16 q5, q5, q6, s9 - dl_esp32p4_128b_unaligned_store0 q5, a0, s1 - addi t0, t0, -1 - bgtz t0, 8b - 9: - addi a1, a1, -16 - add a1, a1, t5 - - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - esp.mov.s16.qacc q5 - - esp.srcmb.s16.qacc q5, a5, 1 - esp.vmulas.s16.qacc q2, q7 - esp.vld.128.ip q6, s8, 16 - esp.srcmb.s16.qacc q5, t4, 1 - - esp.vprelu.s16 q5, q5, q6, s9 - dl_esp32p4_128b_unaligned_store0 q5, a0, s1 - j dl_esp32p4_s16_rescale_unaligned_add2d_shift_remainder_prelu - - - -dl_esp32p4_s16_rescale_unaligned_add2d_shift_small_remainder_prelu: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - -dl_esp32p4_s16_rescale_unaligned_add2d_shift_remainder_prelu: - beqz t5, dl_esp32p4_s16_unaligned_rescale_add2d_output_shift_end_prelu # c remainder - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.mov.s16.qacc q5 - esp.srcmb.s16.qacc q5, a5, 1 - esp.vmulas.s16.qacc q2, q7 - esp.vld.128.ip q6, s8, 16 - esp.srcmb.s16.qacc q5, t4, 1 - - esp.vprelu.s16 q5, q5, q6, s9 - srli t5, t5, 1 - dl_esp32p4_s16_store_remainder q5, t5, s1, a0 - -dl_esp32p4_s16_unaligned_rescale_add2d_output_shift_end_prelu: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s16_conv2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s16_conv2d.S deleted file mode 100644 index d2637272..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s16_conv2d.S +++ /dev/null @@ -1,846 +0,0 @@ -#include "dl_esp32p4_s16.S" -#include "dl_esp32p4_common.S" - - -############################################################################################################################################################ -#### -#### esp32p4_s16_conv2d_11cn series -#### -############################################################################################################################################################ -.macro esp32p4_s16_conv2d_11c8 input_v0, filter_v0, filter_v1, input_ptr, filter_ptr, c_div_x_1 - # scalar * vecter and accumulate into qacc - # input_ptr += (c_div_x_1 + 1) * 16 in the end - # filter_ptr point to the next 16 bytes in the end - - # input_v0: 8 input elements - # filter_v0: 8 filter elements - # filter_v1: 8 filter elements - # input_ptr: input_ptr - # filter_ptr: filter_ptr - # c_div_x_1: input_channel // 8 - 1 - - esp.vld.128.ip \input_v0, \input_ptr, 16 - esp.vld.128.ip \filter_v0, \filter_ptr, 16 - esp.vld.128.ip \filter_v1, \filter_ptr, 16 - beqz \c_div_x_1, 1f - - # lp.setup 0, \c_div_x_1, 0f - esp.lp.setup 0, \c_div_x_1, 0f - esp.vsmulas.s16.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s16.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s16.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s16.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - esp.vsmulas.s16.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - esp.vsmulas.s16.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - esp.vsmulas.s16.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - esp.vsmulas.s16.qacc.ld.incp \input_v0, \input_ptr, \filter_v1, \input_v0, 7 - 0: esp.vld.128.ip \filter_v1, \filter_ptr, 16 - - 1: - esp.vsmulas.s16.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s16.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s16.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s16.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - esp.vsmulas.s16.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - esp.vsmulas.s16.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - esp.vsmulas.s16.qacc \filter_v0, \input_v0, 6 - esp.vsmulas.s16.qacc \filter_v1, \input_v0, 7 -.endm - - - -.macro esp32p4_s16_conv2d_11cn_load_args args, filter_ptr, c_div_x_1, n_rs3, mac_shift - lw \n_rs3, 96(\args) // output_channel_div_8 - lw \mac_shift, 64(\args) // mac_shift - lw \filter_ptr, 48(\args) // filter - lw \c_div_x_1, 100(\args) // input_channel / x - 1 -.endm - - - - .text - .align 2 - .global dl_esp32p4_s16_conv2d_11cn_bias - .type dl_esp32p4_s16_conv2d_11cn_bias, @function - .balign 4 - .option norvc -dl_esp32p4_s16_conv2d_11cn_bias: - - # a0: int16_t *output_ptr - # a1: int16_t *input_ptr - # a2: void *args - - # a3: int16_t *filter_ptr - # a4: mac_shift - # a5: bias_ptr - # t3: - # t4: - # t5: moving_input_ptr - # t6: n_rs3 - - # a6(not for extension instructions): - # a7(not for extension instructions): - # t0(not for extension instructions): - # t1(not for extension instructions): c_div_x_1 - # t2(not for extension instructions): - # s2(not for extension instructions): - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_s16_conv2d_11cn_load_args a2, a3, t1, t6, a4 - lw a5, 68(a2) // bias - - esp32p4_s16_conv2d_11cn_bias_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s16_conv2d_128b_vector_bias a5 - esp32p4_s16_conv2d_11c8 q0, q1, q2, t5, a3, t1 - esp32p4_s16_128b_vector_shift_result q0, a4 - esp32p4_s16_128b_aligned_vector_store q0, a0 - - addi t6, t6, -1 - bnez t6, esp32p4_s16_conv2d_11cn_bias_loop - - ret - - - - .text - .align 2 - .global dl_esp32p4_s16_conv2d_11cn_bias_relu - .type dl_esp32p4_s16_conv2d_11cn_bias_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s16_conv2d_11cn_bias_relu: - - # a0: int16_t *output_ptr - # a1: int16_t *input_ptr - # a2: void *args - - # a3: int16_t *filter_ptr - # a4: mac_shift - # a5: bias_ptr - # t3: activation_alpha/_address - # t4: activation_shift - # t5: moving_input_ptr - # t6: n_rs3 - - # a6(not for extension instructions): - # a7(not for extension instructions): - # t0(not for extension instructions): - # t1(not for extension instructions): c_div_x_1 - # t2(not for extension instructions): - # s2(not for extension instructions): - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_s16_conv2d_11cn_load_args a2, a3, t1, t6, a4 - lw a5, 68(a2) // bias - lw t3, 76(a2) // activation_alpha - lw t4, 84(a2) // activation_shift - - esp32p4_s16_conv2d_11cn_bias_relu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s16_conv2d_128b_vector_bias a5 - esp32p4_s16_conv2d_11c8 q0, q1, q2, t5, a3, t1 - esp32p4_s16_128b_vector_shift_result q0, a4 - esp32p4_s16_128b_vector_relu q0, t3, t4 - esp32p4_s16_128b_aligned_vector_store q0, a0 - - addi t6, t6, -1 - bnez t6, esp32p4_s16_conv2d_11cn_bias_relu_loop - - ret - - - - .text - .align 2 - .global dl_esp32p4_s16_conv2d_11cn - .type dl_esp32p4_s16_conv2d_11cn, @function - .balign 4 - .option norvc -dl_esp32p4_s16_conv2d_11cn: - - # a0: int16_t *output_ptr - # a1: int16_t *input_ptr - # a2: void *args - - # a3: int16_t *filter_ptr - # a4: mac_shift - # a5: - # t3: - # t4: - # t5: moving_input_ptr - # t6: n_rs3 - - # a6(not for extension instructions): - # a7(not for extension instructions): - # t0(not for extension instructions): - # t1(not for extension instructions): c_div_x_1 - # t2(not for extension instructions): - # s2(not for extension instructions): - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_s16_conv2d_11cn_load_args a2, a3, t1, t6, a4 - - esp32p4_s16_conv2d_11cn_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s16_conv2d_11c8 q0, q1, q2, t5, a3, t1 - esp32p4_s16_128b_vector_shift_result q0, a4 - esp32p4_s16_128b_aligned_vector_store q0, a0 - - addi t6, t6, -1 - bnez t6, esp32p4_s16_conv2d_11cn_loop - - ret - - - - .text - .align 2 - .global dl_esp32p4_s16_conv2d_11cn_relu - .type dl_esp32p4_s16_conv2d_11cn_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s16_conv2d_11cn_relu: - - # a0: int16_t *output_ptr - # a1: int16_t *input_ptr - # a2: void *args - - # a3: int16_t *filter_ptr - # a4: mac_shift - # a5: - # t3: activation_alpha/_address - # t4: activation_shift - # t5: moving_input_ptr - # t6: n_rs3 - - # a6(not for extension instructions): - # a7(not for extension instructions): - # t0(not for extension instructions): - # t1(not for extension instructions): c_div_x_1 - # t2(not for extension instructions): - # s2(not for extension instructions): - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_s16_conv2d_11cn_load_args a2, a3, t1, t6, a4 - lw t3, 76(a2) // activation_alpha - lw t4, 84(a2) // activation_shift - - esp32p4_s16_conv2d_11cn_relu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s16_conv2d_11c8 q0, q1, q2, t5, a3, t1 - esp32p4_s16_128b_vector_shift_result q0, a4 - esp32p4_s16_128b_vector_relu q0, t3, t4 - esp32p4_s16_128b_aligned_vector_store q0, a0 - - addi t6, t6, -1 - bnez t6, esp32p4_s16_conv2d_11cn_relu_loop - - ret - - - - - - -############################################################################################################################################################ -#### -#### esp32p4_s16_conv2d_33cn series -#### -############################################################################################################################################################ - -.macro esp32p4_s16_conv2d_33c8 input_v0, filter_v0, filter_v1, input_ptr, filter_ptr, c_div_x_1, dilation_x_offset, dilation_y_offset - # dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(output_t) - # dilation_y_offset = (dilation_y * input_width_with_padding * input_channel_with_padding - input_channel - dilation_x * input_channel_with_padding * (filter_width - 1)) * sizeof(output_t) - - esp32p4_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_y_offset - - esp32p4_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_y_offset - - esp32p4_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - # add \input_ptr, \input_ptr, \dilation_y_offset -.endm - - - -.macro esp32p4_s16_conv2d_hwcn_load_args args, filter_ptr, c_div_x_1, n_rs3, mac_shift, dilation_x_offset, dilation_y_offset - esp32p4_s16_conv2d_11cn_load_args \args, \filter_ptr, \c_div_x_1, \n_rs3, \mac_shift - lw \dilation_x_offset, 108(\args) // input dilation x offset - lw \dilation_y_offset, 112(\args) // input dilation y offset -.endm - - - - .text - .align 2 - .global dl_esp32p4_s16_conv2d_33cn_bias - .type dl_esp32p4_s16_conv2d_33cn_bias, @function - .balign 4 - .option norvc -dl_esp32p4_s16_conv2d_33cn_bias: - - # a0: int16_t *output_ptr - # a1: int16_t *input_ptr - # a2: void *args - - # a3: int16_t *filter_ptr - # a4: mac_shift - # a5: bias_ptr - # t3: - # t4: - # t5: moving_input_ptr - # t6: n_rs3 - - # a6(not for extension instructions): input dilation y offset - # a7(not for extension instructions): - # t0(not for extension instructions): - # t1(not for extension instructions): c_div_x_1 - # t2(not for extension instructions): input dilation x offset - # s2(not for extension instructions): - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_s16_conv2d_hwcn_load_args a2, a3, t1, t6, a4, t2, a6 - lw a5, 68(a2) // bias - - esp32p4_s16_conv2d_33cn_bias_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s16_conv2d_128b_vector_bias a5 - esp32p4_s16_conv2d_33c8 q0, q1, q2, t5, a3, t1, t2, a6 - esp32p4_s16_128b_vector_shift_result q0, a4 - esp32p4_s16_128b_aligned_vector_store q0, a0 - - addi t6, t6, -1 - bnez t6, esp32p4_s16_conv2d_33cn_bias_loop - - ret - - - - .text - .align 2 - .global dl_esp32p4_s16_conv2d_33cn_bias_relu - .type dl_esp32p4_s16_conv2d_33cn_bias_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s16_conv2d_33cn_bias_relu: - - # a0: int16_t *output_ptr - # a1: int16_t *input_ptr - # a2: void *args - - # a3: int16_t *filter_ptr - # a4: mac_shift - # a5: bias_ptr - # t3: activation_alpha/_address - # t4: activation_shift - # t5: moving_input_ptr - # t6: n_rs3 - - # a6(not for extension instructions): input dilation y offset - # a7(not for extension instructions): - # t0(not for extension instructions): - # t1(not for extension instructions): c_div_x_1 - # t2(not for extension instructions): input dilation x offset - # s2(not for extension instructions): - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_s16_conv2d_hwcn_load_args a2, a3, t1, t6, a4, t2, a6 - lw a5, 68(a2) // bias - lw t3, 76(a2) // activation_alpha - lw t4, 84(a2) // activation_shift - - esp32p4_s16_conv2d_33cn_bias_relu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s16_conv2d_128b_vector_bias a5 - esp32p4_s16_conv2d_33c8 q0, q1, q2, t5, a3, t1, t2, a6 - esp32p4_s16_128b_vector_shift_result q0, a4 - esp32p4_s16_128b_vector_relu q0, t3, t4 - esp32p4_s16_128b_aligned_vector_store q0, a0 - - addi t6, t6, -1 - bnez t6, esp32p4_s16_conv2d_33cn_bias_relu_loop - - ret - - - - .text - .align 2 - .global dl_esp32p4_s16_conv2d_33cn - .type dl_esp32p4_s16_conv2d_33cn, @function - .balign 4 - .option norvc -dl_esp32p4_s16_conv2d_33cn: - - # a0: int16_t *output_ptr - # a1: int16_t *input_ptr - # a2: void *args - - # a3: int16_t *filter_ptr - # a4: mac_shift - # a5: - # t3: - # t4: - # t5: moving_input_ptr - # t6: n_rs3 - - # a6(not for extension instructions): input dilation y offset - # a7(not for extension instructions): - # t0(not for extension instructions): - # t1(not for extension instructions): c_div_x_1 - # t2(not for extension instructions): input dilation x offset - # s2(not for extension instructions): - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_s16_conv2d_hwcn_load_args a2, a3, t1, t6, a4, t2, a6 - - esp32p4_s16_conv2d_33cn_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s16_conv2d_33c8 q0, q1, q2, t5, a3, t1, t2, a6 - esp32p4_s16_128b_vector_shift_result q0, a4 - esp32p4_s16_128b_aligned_vector_store q0, a0 - - addi t6, t6, -1 - bnez t6, esp32p4_s16_conv2d_33cn_loop - - ret - - - - .text - .align 2 - .global dl_esp32p4_s16_conv2d_33cn_relu - .type dl_esp32p4_s16_conv2d_33cn_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s16_conv2d_33cn_relu: - - # a0: int16_t *output_ptr - # a1: int16_t *input_ptr - # a2: void *args - - # a3: int16_t *filter_ptr - # a4: mac_shift - # a5: - # t3: activation_alpha/_address - # t4: activation_shift - # t5: moving_input_ptr - # t6: n_rs3 - - # a6(not for extension instructions): input dilation y offset - # a7(not for extension instructions): - # t0(not for extension instructions): - # t1(not for extension instructions): c_div_x_1 - # t2(not for extension instructions): input dilation x offset - # s2(not for extension instructions): - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_s16_conv2d_hwcn_load_args a2, a3, t1, t6, a4, t2, a6 - lw t3, 76(a2) // activation_alpha - lw t4, 84(a2) // activation_shift - - esp32p4_s16_conv2d_33cn_relu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s16_conv2d_33c8 q0, q1, q2, t5, a3, t1, t2, a6 - esp32p4_s16_128b_vector_shift_result q0, a4 - esp32p4_s16_128b_vector_relu q0, t3, t4 - esp32p4_s16_128b_aligned_vector_store q0, a0 - - addi t6, t6, -1 - bnez t6, esp32p4_s16_conv2d_33cn_relu_loop - - ret - - - - - - -############################################################################################################################################################ -#### -#### esp32p4_s16_conv2d_hwcn series -#### -############################################################################################################################################################ -.macro esp32p4_s16_conv2d_hwc8 input_v0, filter_v0, filter_v1, input_ptr, filter_ptr, c_div_x_1, dilation_x_offset, dilation_y_offset, filter_h, filter_w, args, filter_y_offset, filter_n_offset - # dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(output_t) - # dilation_y_offset = (dilation_y * input_width_with_padding * input_channel_with_padding - input_channel - dilation_x * input_channel_with_padding * (filter_width - 1)) * sizeof(output_t) - # filter_h - # filter_w - - lw \filter_h, 52(\args) # filter_height - 2: - lw \filter_w, 56(\args) # filter_width - addi \filter_w, \filter_w, -1 - beqz \filter_w, 4f - // lp.setup 1, \filter_w, 3f - // esp32p4_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - // 3: add \input_ptr, \input_ptr, \dilation_x_offset - 3: - esp32p4_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - addi \filter_w, \filter_w, -1 - bgtz \filter_w, 3b - 4: - esp32p4_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \filter_ptr, \filter_ptr, \filter_y_offset - add \input_ptr, \input_ptr, \dilation_y_offset - - addi \filter_h, \filter_h, -1 - bnez \filter_h, 2b - - add \filter_ptr, \filter_ptr, \filter_n_offset -.endm - - - - .text - .align 2 - .global dl_esp32p4_s16_conv2d_hwcn_bias - .type dl_esp32p4_s16_conv2d_hwcn_bias, @function - .balign 4 - .option norvc -dl_esp32p4_s16_conv2d_hwcn_bias: - - # a0: int16_t *output_ptr - # a1: int16_t *input_ptr - # a2: void *args - - # a3: int16_t *filter_ptr - # a4: mac_shift - # a5: bias_ptr - # t3: - # t4: - # t5: moving_input_ptr - # t6: n_rs3 - - # a6(not for extension instructions): input dilation y offset - # a7(not for extension instructions): filter_height - # t0(not for extension instructions): filter_width - # t1(not for extension instructions): c_div_x_1 - # t2(not for extension instructions): input dilation x offset - # s2(not for extension instructions): filter_y_offset - # s3(not for extension instructions): filter_n_offset - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_push_12_stacks_3r s2, s3, s4 - - esp32p4_s16_conv2d_hwcn_load_args a2, a3, t1, t6, a4, t2, a6 - lw s2, 60(a2) // filter_y_offset - lw s3, 144(a2) // filter_n_offset - lw a5, 68(a2) // bias - - esp32p4_s16_conv2d_hwcn_bias_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s16_conv2d_128b_vector_bias a5 - esp32p4_s16_conv2d_hwc8 q0, q1, q2, t5, a3, t1, t2, a6, a7, t0, a2, s2, s3 - esp32p4_s16_128b_vector_shift_result q0, a4 - esp32p4_s16_128b_aligned_vector_store q0, a0 - - addi t6, t6, -1 - bnez t6, esp32p4_s16_conv2d_hwcn_bias_loop - - esp32p4_pop_12_stacks_3r s2, s3, s4 - ret - - - - .text - .align 2 - .global dl_esp32p4_s16_conv2d_hwcn_bias_relu - .type dl_esp32p4_s16_conv2d_hwcn_bias_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s16_conv2d_hwcn_bias_relu: - - # a0: int16_t *output_ptr - # a1: int16_t *input_ptr - # a2: void *args - - # a3: int16_t *filter_ptr - # a4: mac_shift - # a5: bias_ptr - # t3: activation_alpha/_address - # t4: activation_shift - # t5: moving_input_ptr - # t6: n_rs3 - - # a6(not for extension instructions): input dilation y offset - # a7(not for extension instructions): filter_height - # t0(not for extension instructions): filter_width - # t1(not for extension instructions): c_div_x_1 - # t2(not for extension instructions): input dilation x offset - # s2(not for extension instructions): filter_y_offset - # s3(not for extension instructions): filter_n_offset - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_push_12_stacks_3r s2, s3, s4 - - esp32p4_s16_conv2d_hwcn_load_args a2, a3, t1, t6, a4, t2, a6 - lw s2, 60(a2) // filter_y_offset - lw s3, 144(a2) // filter_n_offset - lw a5, 68(a2) // bias - lw t3, 76(a2) // activation_alpha - lw t4, 84(a2) // activation_shift - - esp32p4_s16_conv2d_hwcn_bias_relu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s16_conv2d_128b_vector_bias a5 - esp32p4_s16_conv2d_hwc8 q0, q1, q2, t5, a3, t1, t2, a6, a7, t0, a2, s2, s3 - esp32p4_s16_128b_vector_shift_result q0, a4 - esp32p4_s16_128b_vector_relu q0, t3, t4 - esp32p4_s16_128b_aligned_vector_store q0, a0 - - addi t6, t6, -1 - bnez t6, esp32p4_s16_conv2d_hwcn_bias_relu_loop - - esp32p4_pop_12_stacks_3r s2, s3, s4 - ret - - - - .text - .align 2 - .global dl_esp32p4_s16_conv2d_hwcn - .type dl_esp32p4_s16_conv2d_hwcn, @function - .balign 4 - .option norvc -dl_esp32p4_s16_conv2d_hwcn: - - # a0: int16_t *output_ptr - # a1: int16_t *input_ptr - # a2: void *args - - # a3: int16_t *filter_ptr - # a4: mac_shift - # a5: - # t3: - # t4: - # t5: moving_input_ptr - # t6: n_rs3 - - # a6(not for extension instructions): input dilation y offset - # a7(not for extension instructions): filter_height - # t0(not for extension instructions): filter_width - # t1(not for extension instructions): c_div_x_1 - # t2(not for extension instructions): input dilation x offset - # s2(not for extension instructions): filter_y_offset - # s3(not for extension instructions): filter_n_offset - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_push_12_stacks_3r s2, s3, s4 - - esp32p4_s16_conv2d_hwcn_load_args a2, a3, t1, t6, a4, t2, a6 - lw s2, 60(a2) // filter_y_offset - lw s3, 144(a2) // filter_n_offset - - esp32p4_s16_conv2d_hwcn_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s16_conv2d_hwc8 q0, q1, q2, t5, a3, t1, t2, a6, a7, t0, a2, s2, s3 - esp32p4_s16_128b_vector_shift_result q0, a4 - esp32p4_s16_128b_aligned_vector_store q0, a0 - - addi t6, t6, -1 - bnez t6, esp32p4_s16_conv2d_hwcn_loop - - esp32p4_pop_12_stacks_3r s2, s3, s4 - ret - - - - .text - .align 2 - .global dl_esp32p4_s16_conv2d_hwcn_relu - .type dl_esp32p4_s16_conv2d_hwcn_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s16_conv2d_hwcn_relu: - - # a0: int16_t *output_ptr - # a1: int16_t *input_ptr - # a2: void *args - - # a3: int16_t *filter_ptr - # a4: mac_shift - # a5: - # t3: activation_alpha/_address - # t4: activation_shift - # t5: moving_input_ptr - # t6: n_rs3 - - # a6(not for extension instructions): input dilation y offset - # a7(not for extension instructions): filter_height - # t0(not for extension instructions): filter_width - # t1(not for extension instructions): c_div_x_1 - # t2(not for extension instructions): input dilation x offset - # s2(not for extension instructions): filter_y_offset - # s3(not for extension instructions): filter_n_offset - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_push_12_stacks_3r s2, s3, s4 - - esp32p4_s16_conv2d_hwcn_load_args a2, a3, t1, t6, a4, t2, a6 - lw s2, 60(a2) // filter_y_offset - lw s3, 144(a2) // filter_n_offset - lw t3, 76(a2) // activation_alpha - lw t4, 84(a2) // activation_shift - - esp32p4_s16_conv2d_hwcn_relu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s16_conv2d_hwc8 q0, q1, q2, t5, a3, t1, t2, a6, a7, t0, a2, s2, s3 - esp32p4_s16_128b_vector_shift_result q0, a4 - esp32p4_s16_128b_vector_relu q0, t3, t4 - esp32p4_s16_128b_aligned_vector_store q0, a0 - - addi t6, t6, -1 - bnez t6, esp32p4_s16_conv2d_hwcn_relu_loop - - esp32p4_pop_12_stacks_3r s2, s3, s4 - ret \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s16_mul2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s16_mul2d.S deleted file mode 100644 index 4720e1b7..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s16_mul2d.S +++ /dev/null @@ -1,697 +0,0 @@ -#include "dl_esp32p4_s16.S" -#include "dl_esp32p4_common.S" - -############################################################################################################################################################ -#### -#### esp32p4_s16_mul2d_11c series -#### -############################################################################################################################################################ - - .align 4 - .text - .global dl_esp32p4_s16_mul2d_11c - .type dl_esp32p4_s16_mul2d_11c, @function - #.section .iram1 -dl_esp32p4_s16_mul2d_11c: - .align 4 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: mul_shift - - - lw a4, 64(a3) - lw a5, 100(a3) - bltz a4, 5f - - ESP.VLD.128.IP q0, a1, 16 - ESP.VLD.128.IP q1, a2, 16 - add t0, a4, x0 - blez t0, 1f - 0: - ESP.ZERO.QACC - ESP.VMULAS.S16.QACC.LD.IP q0, a1, 16, q0, q1 - ESP.VLD.128.IP q1, a2, 16 - ESP.SRCMB.S16.QACC q2, a5, 1 - ESP.VST.128.IP q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - ESP.ZERO.QACC - ESP.VMULAS.S16.QACC q0, q1 - ESP.SRCMB.S16.QACC q2, a5, 1 - ESP.VST.128.IP q2, a0, 16 -5: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - .align 4 - .text - .global dl_esp32p4_s16_mul2d_11c_relu - .type dl_esp32p4_s16_mul2d_11c_relu, @function - #.section .iram1 -dl_esp32p4_s16_mul2d_11c_relu: - .align 4 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: mul_shift - # s8: activation_alpha - # s9: activation_shift - - - lw a4, 64(a3) - lw a5, 100(a3) - lw t3, 76(a3) - lw s8, 52(a3) - lw s9, 60(a3) - bltz a4, 5f - - ESP.VLD.128.IP q0, a1, 16 - ESP.VLD.128.IP q1, a2, 16 - add t0, a4, x0 - blez t0, 1f - 0: - ESP.ZERO.QACC - ESP.VMULAS.S16.QACC.LD.IP q0, a1, 16, q0, q1 - ESP.VLD.128.IP q1, a2, 16 - ESP.SRCMB.S16.QACC q2, a5, 1 - ESP.VRELU.S16 q2, s8, s9 - ESP.VST.128.IP q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - ESP.ZERO.QACC - ESP.VMULAS.S16.QACC q0, q1 - ESP.SRCMB.S16.QACC q2, a5, 1 - ESP.VRELU.S16 q2, s8, s9 - ESP.VST.128.IP q2, a0, 16 -5: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - .align 4 - .text - .global dl_esp32p4_s16_mul2d_11c_prelu - .type dl_esp32p4_s16_mul2d_11c_prelu, @function - #.section .iram1 -dl_esp32p4_s16_mul2d_11c_prelu: - .align 4 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: mul_shift - # s8: activation_alpha_ptr - # s9: activation_shift - - - lw a4, 64(a3) - lw a5, 100(a3) - lw s8, 56(a3) - lw s9, 60(a3) - bltz a4, 5f - - ESP.VLD.128.IP q0, a1, 16 - ESP.VLD.128.IP q1, a2, 16 - add t0, a4, x0 - blez t0, 1f - 0: - ESP.ZERO.QACC - ESP.VMULAS.S16.QACC.LD.IP q0, a1, 16, q0, q1 - ESP.VLD.128.IP q1, a2, 16 - - ESP.VLD.128.IP q3, s8, 16 - ESP.SRCMB.S16.QACC q2, a5, 1 - ESP.VPRELU.S16 q2, q2, q3, s9 - ESP.VST.128.IP q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - ESP.ZERO.QACC - ESP.VMULAS.S16.QACC q0, q1 - ESP.VLD.128.IP q3, s8, 16 - ESP.SRCMB.S16.QACC q2, a5, 1 - ESP.VPRELU.S16 q2, q2, q3, s9 - ESP.VST.128.IP q2, a0, 16 -5: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -############################################################################################################################################################ -#### -#### esp32p4_S16_unaligned_mul2d_11c series -#### -############################################################################################################################################################ - - .align 4 - .text - .global dl_esp32p4_s16_unaligned_mul2d_11c - .type dl_esp32p4_s16_unaligned_mul2d_11c, @function - #.section .iram1 -dl_esp32p4_s16_unaligned_mul2d_11c: - .align 4 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: c_remainder - # t3: mul_shift - - - lw a4, 64(a3) - lw a5, 76(a3) - lw t3, 100(a3) - - ESP.LD.128.USAR.IP q5, a0, 0 #get output_ptr sar_byte - ESP.MOVX.R.SAR.BYTES s1 - - bltz a4, dl_tie718_S16_unaligned_mul2d_11c_small_remainder # channel < 8 - - ESP.LD.128.USAR.IP q0, a1, 16 - ESP.LD.128.USAR.IP q3, a2, 16 - ESP.LD.128.USAR.IP q1, a1, 16 - - beqz s1, dl_tie718_S16_unaligned_mul2d_11c_0 - li t0, 8 - beq s1, t0, dl_tie718_S16_unaligned_mul2d_11c_1 - - add t0, a4, x0 - blez t0, 1f - 0: - ESP.ZERO.QACC - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.IP q4, a2, 16 - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.LD.128.USAR.IP q1, a1, 16 - dl_esp32p4_128b_unaligned_store0 q2, a0, s1 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - addi a1, a1, -16 - add a1, a1, a5 - ESP.ZERO.QACC - ESP.MOVX.R.SAR.BYTES t6 #input0 sar - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.XP q4, a2, a5 - ESP.MOVX.R.SAR.BYTES s0 #input1 sar - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - dl_esp32p4_128b_unaligned_store0 q2, a0, s1 - j dl_tie718_S16_unaligned_mul2d_11c_remainder - -dl_tie718_S16_unaligned_mul2d_11c_0: - - add t0, a4, x0 - blez t0, 3f - 2: - ESP.ZERO.QACC - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.IP q4, a2, 16 - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.LD.128.USAR.IP q1, a1, 16 - ESP.VST.128.IP q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - - addi a1, a1, -16 - add a1, a1, a5 - ESP.ZERO.QACC - ESP.MOVX.R.SAR.BYTES t6 #input0 sar - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.XP q4, a2, a5 - ESP.MOVX.R.SAR.BYTES s0 #input1 sar - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.VST.128.IP q2, a0, 16 - j dl_tie718_S16_unaligned_mul2d_11c_remainder - -dl_tie718_S16_unaligned_mul2d_11c_1: - - add t0, a4, x0 - blez t0, 5f - 4: - ESP.ZERO.QACC - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.IP q4, a2, 16 - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.LD.128.USAR.IP q1, a1, 16 - dl_esp32p4_128b_unaligned_store1 q2, a0 - addi t0, t0, -1 - bgtz t0, 4b - 5: - - addi a1, a1, -16 - add a1, a1, a5 - ESP.ZERO.QACC - ESP.MOVX.R.SAR.BYTES t6 #input0 sar - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.XP q4, a2, a5 - ESP.MOVX.R.SAR.BYTES s0 #input1 sar - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - dl_esp32p4_128b_unaligned_store1 q2, a0 - - j dl_tie718_S16_unaligned_mul2d_11c_remainder - -dl_tie718_S16_unaligned_mul2d_11c_small_remainder: - ESP.LD.128.USAR.XP q0, a1, a5 - ESP.MOVX.R.SAR.BYTES t6 - - ESP.LD.128.USAR.XP q3, a2, a5 - ESP.MOVX.R.SAR.BYTES s0 - -dl_tie718_S16_unaligned_mul2d_11c_remainder: - - - beqz a5, dl_esp32p4_S16_unaligned_mul2d_11c_end - - ESP.LD.128.USAR.IP q1, a1, 0 - ESP.MOVX.W.SAR.BYTES t6 - ESP.SRC.Q q2, q0, q1 - - ESP.LD.128.USAR.IP q4, a2, 0 - ESP.MOVX.W.SAR.BYTES s0 - ESP.SRC.Q q5, q3, q4 - - ESP.ZERO.QACC - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - - srli a5, a5, 1 - dl_esp32p4_s16_store_remainder q2, a5, s0, a0 - -dl_esp32p4_S16_unaligned_mul2d_11c_end: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - .align 4 - .text - .global dl_esp32p4_s16_unaligned_mul2d_11c_relu - .type dl_esp32p4_s16_unaligned_mul2d_11c_relu, @function - #.section .iram1 -dl_esp32p4_s16_unaligned_mul2d_11c_relu: - .align 4 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: c_remainder - # t3: mul_shift - # s8: activation_alpha - # s9: activation_shift - - - lw a4, 64(a3) - lw a5, 76(a3) - lw t3, 100(a3) - lw s8, 52(a3) - lw s9, 60(a3) - - - - ESP.LD.128.USAR.IP q5, a0, 0 #get output_ptr sar_byte - ESP.MOVX.R.SAR.BYTES s1 - - bltz a4, dl_tie718_S16_unaligned_mul2d_11c_relu_small_remainder # channel < 8 - - - ESP.LD.128.USAR.IP q0, a1, 16 - ESP.LD.128.USAR.IP q3, a2, 16 - ESP.LD.128.USAR.IP q1, a1, 16 - - beqz s1, dl_tie718_S16_unaligned_mul2d_11c_relu_0 - li t0, 8 - beq s1, t0, dl_tie718_S16_unaligned_mul2d_11c_relu_1 - - - add t0, a4, x0 - blez t0, 1f - 0: - ESP.ZERO.QACC - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.IP q4, a2, 16 - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.LD.128.USAR.IP q1, a1, 16 - ESP.VRELU.S16 q2, s8, s9 - dl_esp32p4_128b_unaligned_store0 q2, a0, s1 - addi t0, t0, -1 - bgtz t0, 0b - 1: - addi a1, a1, -16 - add a1, a1, a5 - ESP.ZERO.QACC - ESP.MOVX.R.SAR.BYTES t6 #input0 sar - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.XP q4, a2, a5 - ESP.MOVX.R.SAR.BYTES s0 #input1 sar - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.VRELU.S16 q2, s8, s9 - dl_esp32p4_128b_unaligned_store0 q2, a0, s1 - j dl_tie718_S16_unaligned_mul2d_11c_relu_remainder - -dl_tie718_S16_unaligned_mul2d_11c_relu_0: - - add t0, a4, x0 - blez t0, 3f - 2: - ESP.ZERO.QACC - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.IP q4, a2, 16 - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.LD.128.USAR.IP q1, a1, 16 - ESP.VRELU.S16 q2, s8, s9 - ESP.VST.128.IP q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - addi a1, a1, -16 - add a1, a1, a5 - ESP.ZERO.QACC - ESP.MOVX.R.SAR.BYTES t6 #input0 sar - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.XP q4, a2, a5 - ESP.MOVX.R.SAR.BYTES s0 #input1 sar - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.VRELU.S16 q2, s8, s9 - ESP.VST.128.IP q2, a0, 16 - j dl_tie718_S16_unaligned_mul2d_11c_relu_remainder - -dl_tie718_S16_unaligned_mul2d_11c_relu_1: - - add t0, a4, x0 - blez t0, 5f - 4: - ESP.ZERO.QACC - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.IP q4, a2, 16 - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.LD.128.USAR.IP q1, a1, 16 - ESP.VRELU.S16 q2, s8, s9 - dl_esp32p4_128b_unaligned_store1 q2, a0 - addi t0, t0, -1 - bgtz t0, 4b - 5: - addi a1, a1, -16 - add a1, a1, a5 - ESP.ZERO.QACC - ESP.MOVX.R.SAR.BYTES t6 #input0 sar - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.XP q4, a2, a5 - ESP.MOVX.R.SAR.BYTES s0 #input1 sar - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.VRELU.S16 q2, s8, s9 - dl_esp32p4_128b_unaligned_store1 q2, a0 - j dl_tie718_S16_unaligned_mul2d_11c_relu_remainder - -dl_tie718_S16_unaligned_mul2d_11c_relu_small_remainder: - ESP.LD.128.USAR.XP q0, a1, a5 - ESP.MOVX.R.SAR.BYTES t6 - - ESP.LD.128.USAR.XP q3, a2, a5 - ESP.MOVX.R.SAR.BYTES s0 - -dl_tie718_S16_unaligned_mul2d_11c_relu_remainder: - - - beqz a5, dl_esp32p4_S16_unaligned_mul2d_11c_relu_end - - ESP.LD.128.USAR.IP q1, a1, 0 - ESP.MOVX.W.SAR.BYTES t6 - ESP.SRC.Q q2, q0, q1 - - ESP.LD.128.USAR.IP q4, a2, 0 - ESP.MOVX.W.SAR.BYTES s0 - ESP.SRC.Q q5, q3, q4 - - ESP.ZERO.QACC - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.VRELU.S16 q2, s8, s9 - srli a5, a5, 1 - dl_esp32p4_s16_store_remainder q2, a5, s0, a0 - -dl_esp32p4_S16_unaligned_mul2d_11c_relu_end: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - .align 4 - .text - .global dl_esp32p4_s16_unaligned_mul2d_11c_prelu - .type dl_esp32p4_s16_unaligned_mul2d_11c_prelu, @function - #.section .iram1 -dl_esp32p4_s16_unaligned_mul2d_11c_prelu: - .align 4 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: c_remainder - # t3: mul_shift - # s8: activation_alpha_ptr - # s9: activation_shift - - - lw a4, 64(a3) - lw a5, 76(a3) - lw t3, 100(a3) - lw s8, 56(a3) - lw s9, 60(a3) - - - - ESP.LD.128.USAR.IP q5, a0, 0 #get output_ptr sar_byte - ESP.MOVX.R.SAR.BYTES s1 - - bltz a4, dl_tie718_S16_unaligned_mul2d_11c_prelu_small_remainder # channel < 8 - - - ESP.LD.128.USAR.IP q0, a1, 16 - ESP.LD.128.USAR.IP q3, a2, 16 - ESP.LD.128.USAR.IP q1, a1, 16 - - beqz s1, dl_tie718_S16_unaligned_mul2d_11c_prelu_0 - li t0, 8 - beq s1, t0, dl_tie718_S16_unaligned_mul2d_11c_prelu_1 - - - add t0, a4, x0 - blez t0, 1f - 0: - ESP.ZERO.QACC - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.IP q4, a2, 16 - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - - ESP.VLD.128.IP q6, s8, 16 - ESP.LD.128.USAR.IP q1, a1, 16 - ESP.VPRELU.S16 q2, q2, q6, s9 - dl_esp32p4_128b_unaligned_store0 q2, a0, s1 - addi t0, t0, -1 - bgtz t0, 0b - 1: - addi a1, a1, -16 - add a1, a1, a5 - ESP.ZERO.QACC - ESP.MOVX.R.SAR.BYTES t6 #input0 sar - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.XP q4, a2, a5 - ESP.MOVX.R.SAR.BYTES s0 #input1 sar - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.VLD.128.IP q6, s8, 16 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.VPRELU.S16 q2, q2, q6, s9 - dl_esp32p4_128b_unaligned_store0 q2, a0, s1 - j dl_tie718_S16_unaligned_mul2d_11c_prelu_remainder - -dl_tie718_S16_unaligned_mul2d_11c_prelu_0: - - add t0, a4, x0 - blez t0, 3f - 2: - ESP.ZERO.QACC - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.IP q4, a2, 16 - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.VLD.128.IP q6, s8, 16 - ESP.LD.128.USAR.IP q1, a1, 16 - ESP.VPRELU.S16 q2, q2, q6, s9 - ESP.VST.128.IP q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - addi a1, a1, -16 - add a1, a1, a5 - ESP.ZERO.QACC - ESP.MOVX.R.SAR.BYTES t6 #input0 sar - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.XP q4, a2, a5 - ESP.MOVX.R.SAR.BYTES s0 #input1 sar - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.VLD.128.IP q6, s8, 16 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.VPRELU.S16 q2, q2, q6, s9 - ESP.VST.128.IP q2, a0, 16 - j dl_tie718_S16_unaligned_mul2d_11c_prelu_remainder - -dl_tie718_S16_unaligned_mul2d_11c_prelu_1: - - add t0, a4, x0 - blez t0, 5f - 4: - ESP.ZERO.QACC - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.IP q4, a2, 16 - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.VLD.128.IP q6, s8, 16 - ESP.LD.128.USAR.IP q1, a1, 16 - ESP.VPRELU.S16 q2, q2, q6, s9 - dl_esp32p4_128b_unaligned_store1 q2, a0 - addi t0, t0, -1 - bgtz t0, 4b - 5: - addi a1, a1, -16 - add a1, a1, a5 - ESP.ZERO.QACC - ESP.MOVX.R.SAR.BYTES t6 #input0 sar - ESP.SRC.Q.QUP q2, q0, q1 - - ESP.LD.128.USAR.XP q4, a2, a5 - ESP.MOVX.R.SAR.BYTES s0 #input1 sar - ESP.SRC.Q.QUP q5, q3, q4 - - ESP.VMULAS.S16.QACC q2, q5 - ESP.VLD.128.IP q6, s8, 16 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.VPRELU.S16 q2, q2, q6, s9 - dl_esp32p4_128b_unaligned_store1 q2, a0 - j dl_tie718_S16_unaligned_mul2d_11c_prelu_remainder - - -dl_tie718_S16_unaligned_mul2d_11c_prelu_small_remainder: - ESP.LD.128.USAR.XP q0, a1, a5 - ESP.MOVX.R.SAR.BYTES t6 - - ESP.LD.128.USAR.XP q3, a2, a5 - ESP.MOVX.R.SAR.BYTES s0 - -dl_tie718_S16_unaligned_mul2d_11c_prelu_remainder: - - beqz a5, dl_esp32p4_S16_unaligned_mul2d_11c_prelu_end - - ESP.LD.128.USAR.IP q1, a1, 0 - ESP.MOVX.W.SAR.BYTES t6 - ESP.SRC.Q q2, q0, q1 - - ESP.LD.128.USAR.IP q4, a2, 0 - ESP.MOVX.W.SAR.BYTES s0 - ESP.SRC.Q q5, q3, q4 - - ESP.ZERO.QACC - ESP.VMULAS.S16.QACC q2, q5 - ESP.VLD.128.IP q6, s8, 16 - ESP.SRCMB.S16.QACC q2, t3, 1 - ESP.VPRELU.S16 q2, q2, q6, s9 - srli a5, a5, 1 - dl_esp32p4_s16_store_remainder q2, a5, s0, a0 - -dl_esp32p4_S16_unaligned_mul2d_11c_prelu_end: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8.S deleted file mode 100644 index cdcfece7..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8.S +++ /dev/null @@ -1,252 +0,0 @@ -#pragma once - - -############################################################################################################################################################ -# result process for conv2d / depthwise_conv2d -############################################################################################################################################################ -.macro esp32p4_s8_conv2d_128b_vector_bias bias_ptr - esp.ld.qacc.l.l.128.ip \bias_ptr, 16 - esp.ld.qacc.l.h.128.ip \bias_ptr, 16 - esp.ld.qacc.h.l.128.ip \bias_ptr, 16 - esp.ld.qacc.h.h.128.ip \bias_ptr, 16 -.endm - - - -.macro esp32p4_s8_conv2d_element_bias bias_ptr, tmp - lw \tmp, 0(\bias_ptr) - addi \bias_ptr, \bias_ptr, 4 - esp.movx.w.xacc.l \tmp - slti \tmp, \tmp, 0 // if tmp < 0, tmp = 1, otherwise tmp = 0 - slli \tmp, \tmp, 31 // shift left to the sign bit. - srai \tmp, \tmp, 31 // extend the sign bit to all bits. - esp.movx.w.xacc.h \tmp -.endm - - - -############################################################################################################################################################ -# esp32p4_s8_32b_unaligned_vector series -############################################################################################################################################################ -.macro esp32p4_s8_32b_unaligned_vector_store output_v, output_ptr, tmp - esp.movi.32.a \output_v, \tmp, 0 - sw \tmp, 0(\output_ptr) - esp.movi.32.a \output_v, \tmp, 1 - sw \tmp, 4(\output_ptr) - esp.movi.32.a \output_v, \tmp, 2 - sw \tmp, 8(\output_ptr) - esp.movi.32.a \output_v, \tmp, 3 - sw \tmp, 12(\output_ptr) - addi \output_ptr, \output_ptr, 16 -.endm - - - -############################################################################################################################################################ -# esp32p4_s8_64b_unaligned_vector series -############################################################################################################################################################ -.macro esp32p4_s8_64b_unaligned_vector_store output_v, output_ptr - esp.vst.l.64.ip \output_v, \output_ptr, 8 - esp.vst.h.64.ip \output_v, \output_ptr, 8 -.endm - - - -############################################################################################################################################################ -# esp32p4_s8_128b_vector series -############################################################################################################################################################ -.macro esp32p4_s8_128b_vector_shift_result output_v, mac_shift - esp.srcmb.s8.qacc \output_v, \mac_shift, 1 -.endm - - - -.macro esp32p4_s8_128b_aligned_vector_store output_v, output_ptr - esp.vst.128.ip \output_v, \output_ptr, 16 -.endm - - - -.macro esp32p4_s8_128b_vector_relu output_v, activation_alpha, activation_shift - # LeakyReLU - esp.vrelu.s8 \output_v, \activation_alpha, \activation_shift -.endm - -.macro esp32p4_s8_128b_vector_prelu output_v, activation_alpha_v, activation_alpha_ptr, activation_shift - esp.vld.128.ip \activation_alpha_v, \activation_alpha_ptr, 16 - esp.vprelu.s8 \output_v, \output_v, \activation_alpha_v, \activation_shift -.endm - - - -.macro dl_esp32p4_s8_last_store_data tmp_q, output_v, tmp_a, c_remainder_bytes - movi \tmp_a, 15 - sub \tmp_a, \tmp_a, \c_remainder_bytes - movi \c_remainder_bytes, 0 - esp.slcxxp.2q \tmp_q, \output_v, \tmp_a, \c_remainder_bytes #left shift to make the rest part 0 - esp.srcxxp.2q \output_v, \tmp_q, \tmp_a, \c_remainder_bytes #right shift to lower bits -.endm - - - -.macro dl_esp32p4_s8_store_remainder output_v, tmp_a0, tmp_a1, tmp_a2, tmp_a3, tmp_a4, output_ptr, remainder_c - esp.movi.32.a \output_v, \tmp_a0, 0 -615: # remainder_c == 15, 0x1111 - andi \tmp_a4, \remainder_c, 8 - beqz \tmp_a4, 607f - esp.movi.32.a \output_v, \tmp_a1, 1 - andi \tmp_a4, \remainder_c, 4 - beqz \tmp_a4, 611f - esp.movi.32.a \output_v, \tmp_a2, 2 - andi \tmp_a4, \remainder_c, 2 - beqz \tmp_a4, 613f - esp.movi.32.a \output_v, \tmp_a3, 3 - andi \tmp_a4, \remainder_c, 1 - beqz \tmp_a4, 614f - - sw \tmp_a0, 0(\output_ptr) - sw \tmp_a1, 4(\output_ptr) - sw \tmp_a2, 8(\output_ptr) - sh \tmp_a3, 12(\output_ptr) - srai \tmp_a3, \tmp_a3, 16 - sb \tmp_a3, 14(\output_ptr) - j 616f - -614: # remainder_c == 14, 0x1110 - sw \tmp_a0, 0(\output_ptr) - sw \tmp_a1, 4(\output_ptr) - sw \tmp_a2, 8(\output_ptr) - sh \tmp_a3, 12(\output_ptr) - j 616f - -613: # remainder_c == 13, 0x1101 - andi \tmp_a4, \remainder_c, 1 - beqz \tmp_a4, 612f - esp.movi.32.a \output_v, \tmp_a3, 3 - sw \tmp_a0, 0(\output_ptr) - sw \tmp_a1, 4(\output_ptr) - sw \tmp_a2, 8(\output_ptr) - sb \tmp_a3, 12(\output_ptr) - j 616f - -612: # remainder_c == 12, 0x1100 - sw \tmp_a0, 0(\output_ptr) - sw \tmp_a1, 4(\output_ptr) - sw \tmp_a2, 8(\output_ptr) - j 616f - -611: # remainder_c == 11, 0x1011 - andi \tmp_a4, \remainder_c, 2 - beqz \tmp_a4, 609f - esp.movi.32.a \output_v, \tmp_a2, 2 - andi \tmp_a4, \remainder_c, 1 - beqz \tmp_a4, 610f - sw \tmp_a0, 0(\output_ptr) - sw \tmp_a1, 4(\output_ptr) - sh \tmp_a2, 8(\output_ptr) - srai \tmp_a2, \tmp_a2, 16 - sb \tmp_a2, 10(\output_ptr) - j 616f -610: # remainder_c == 10, 0x1010 - sw \tmp_a0, 0(\output_ptr) - sw \tmp_a1, 4(\output_ptr) - sh \tmp_a2, 8(\output_ptr) - j 616f -609: # remainder_c == 9, 0x1001 - andi \tmp_a4, \remainder_c, 1 - beqz \tmp_a4, 608f - esp.movi.32.a \output_v, \tmp_a2, 2 - sw \tmp_a0, 0(\output_ptr) - sw \tmp_a1, 4(\output_ptr) - sb \tmp_a2, 8(\output_ptr) - j 616f -608: # remainder_c == 8, 0x1000 - sw \tmp_a0, 0(\output_ptr) - sw \tmp_a1, 4(\output_ptr) - j 616f - -607: # remainder == 7, 0x111 - andi \tmp_a4, \remainder_c, 4 - beqz \tmp_a4, 603f - andi \tmp_a4, \remainder_c, 2 - beqz \tmp_a4, 605f - esp.movi.32.a \output_v, \tmp_a1, 1 - andi \tmp_a4, \remainder_c, 1 - beqz \tmp_a4, 606f - sw \tmp_a0, 0(\output_ptr) - sh \tmp_a1, 4(\output_ptr) - srai \tmp_a1, \tmp_a1, 16 - sb \tmp_a1, 6(\output_ptr) - j 616f - -606: # remainder == 6, 0x110 - sw \tmp_a0, 0(\output_ptr) - sh \tmp_a1, 4(\output_ptr) - j 616f - -605: # remainder == 4, 5 - andi \tmp_a4, \remainder_c, 1 - beqz \tmp_a4, 604f - # remainder == 5, 0x101 - esp.movi.32.a \output_v, \tmp_a1, 1 - sw \tmp_a0, 0(\output_ptr) - sb \tmp_a1, 4(\output_ptr) - j 616f - -604: # remainder == 4, 0x100 - sw \tmp_a0, 0(\output_ptr) - j 616f - -603: # remainder == 1, 2, 3 - andi \tmp_a4, \remainder_c, 2 - beqz \tmp_a4, 601f - andi \tmp_a4, \remainder_c, 1 - beqz \tmp_a4, 602f - # remainder == 3, 0x011 - sh \tmp_a0, 0(\output_ptr) - srai \tmp_a0, \tmp_a0, 16 - sb \tmp_a0, 2(\output_ptr) - j 616f - -602: # remainder == 2, 0x010 - sh \tmp_a0, 0(\output_ptr) - j 616f - -601: # remainder == 1, 0x001 - sb \tmp_a0, 0(\output_ptr) - -616: -.endm - - - -############################################################################################################################################################ -# esp32p4_s8_element series -############################################################################################################################################################ -.macro esp32p4_s8_element_result output, mac_shift - esp.srs.s.xacc \output, \mac_shift -.endm - - - -.macro esp32p4_s8_element_store output_ptr, output - sb \output, 0(\output_ptr) - addi \output_ptr, \output_ptr, 1 -.endm - - - -.macro esp32p4_s8_element_leakyrelu output, alpha, shift - bgez \output, 0f - mul \output, \output, \alpha - sra \output, \output, \shift - 0: -.endm - -.macro esp32p4_s8_element_prelu output, alpha_ptr, shift - bgez \output, 0f - mul \output, \output, \alpha_ptr - sra \output, \output, \shift - addi \alpha_ptr, \alpha_ptr, 1 - 0: -.endm \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_add2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_add2d.S deleted file mode 100644 index 92a69498..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_add2d.S +++ /dev/null @@ -1,1680 +0,0 @@ -#include "dl_esp32p4_s8.S" -#include "dl_esp32p4_common.S" - -############################################################################################################################################################ -#### -#### esp32p4_s8_add2d_11c series -#### -############################################################################################################################################################ - -.macro dl_esp32p4_rescale_add_rescale_output input0, input1, output, output_scale, output_shift - esp.zero.qacc - esp.vmulas.s8.qacc \input0, \output_scale - esp.vmulas.s8.qacc \input1, \output_scale - esp.srcmb.s8.qacc \output, \output_shift, 1 -.endm - - - - .align 2 - .text - .global dl_esp32p4_s8_add2d_11c - .type dl_esp32p4_s8_add2d_11c, @function - #.section .iram1 -dl_esp32p4_s8_add2d_11c: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_2x_1 - # a5: c_left_x_1 - - lw a4, 68(a3) - lw a5, 72(a3) - - li t0, 1 - blt a4, t0, dl_esp32p4_s8_add2d_small_channel - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - add t0, a4, x0 - blez t0, 1f - 0: - esp.vld.128.ip q2, a1, 16 - esp.vadd.s8.ld.incp q3, a2, q4, q0, q1 - esp.vst.128.ip q4, a0, 16 - - esp.vld.128.ip q0, a1, 16 - esp.vadd.s8.ld.incp q1, a2, q5, q2, q3 - esp.vst.128.ip q5, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - li t0, 1 - beq a5, t0, 2f #remainder == 2*16byte - li t0, 2 - beq a5, t0, 3f #remainder == 3*16byte - - 2: - esp.vld.128.ip q2, a1, 16 - esp.vadd.s8.ld.incp q3, a2, q4, q0, q1 - esp.vst.128.ip q4, a0, 16 - - esp.vadd.s8 q5, q2, q3 - esp.vst.128.ip q5, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - 3: - esp.vld.128.ip q2, a1, 16 - esp.vadd.s8.ld.incp q3, a2, q4, q0, q1 - esp.vst.128.ip q4, a0, 16 - - esp.vld.128.ip q0, a1, 16 - esp.vadd.s8.ld.incp q1, a2, q5, q2, q3 - esp.vst.128.ip q5, a0, 16 - - esp.vadd.s8 q4, q0, q1 - esp.vst.128.ip q4, a0, 16 - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -dl_esp32p4_s8_add2d_small_channel: # channel < 3*s (16) - - add t0, a5, x0 - blez t0, 1f - 0: - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - - esp.vadd.s8 q2, q0, q1 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - - esp.vadd.s8 q2, q0, q1 - esp.vst.128.ip q2, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - .align 2 - .text - .global dl_esp32p4_s8_rescale_add2d_11c - .type dl_esp32p4_s8_rescale_add2d_11c, @function - #.section .iram1 -dl_esp32p4_s8_rescale_add2d_11c: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr: >> shift or *scale) >> shift - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr: input1 >> shift + input0 * 1 - # a3: void *args - # a4: c_div_x_1 - # a5: input_shift - # t3: output_scale - # t4: output_shift - - - lw a4, 64(a3) - lw a5, 88(a3) - lw t3, 96(a3) - lw t4, 92(a3) - - li t0, 1 - beq t3, t0, dl_esp32p4_s8_rescale_add2d_output - -dl_esp32p4_s8_rescale_add2d_output_scale: # *scale) >> shift - - sb t3, 0(sp) - add s11, sp, x0 - esp.vldbc.8.ip q7, s11, 0 # all output_scale - - add t0, a4, x0 - blez t0, 1f - 0: - esp.ldqa.s8.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - esp.srcmb.s8.qacc q1, a5, 1 - - dl_esp32p4_rescale_add_rescale_output q0, q1, q1, q7, t4 - - esp.vst.128.ip q1, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - esp.ldqa.s8.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - esp.srcmb.s8.qacc q1, a5, 1 - - dl_esp32p4_rescale_add_rescale_output q0, q1, q1, q7, t4 - esp.vst.128.ip q1, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - -dl_esp32p4_s8_rescale_add2d_output: # >> shift - li s1, 1 - sb s1, 0(sp) - add s11, sp, x0 - esp.vldbc.8.ip q7, s11, 0 # all 1 - - esp.ldqa.s8.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - add t0, a4, x0 - blez t0, 3f - 2: - esp.srcmb.s8.qacc q1, a5, 1 - esp.vmulas.s8.qacc.ld.ip q0, a1, 16, q0, q7 - esp.srcmb.s8.qacc q1, t4, 1 - esp.ldqa.s8.128.ip a2, 16 - esp.vst.128.ip q1, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - - esp.srcmb.s8.qacc q1, a5, 1 - esp.vmulas.s8.qacc q0, q7 - esp.srcmb.s8.qacc q1, t4, 1 - esp.vst.128.ip q1, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - - .align 2 - .text - .global dl_esp32p4_s8_add2d_11c_relu - .type dl_esp32p4_s8_add2d_11c_relu, @function - #.section .iram1 -dl_esp32p4_s8_add2d_11c_relu: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_2x_1 - # a5: c_left_x_1 - # s8: activation_alpha - # s9: activation_shift - - - lw a4, 68(a3) - lw a5, 72(a3) - lw s8, 52(a3) - lw s9, 60(a3) - - li t0, 1 - blt a4, t0, dl_esp32p4_s8_add2d_relu_small_channel - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - add t0, a4, x0 - blez t0, 1f - 0: - esp.vld.128.ip q2, a1, 16 - esp.vadd.s8.ld.incp q3, a2, q4, q0, q1 - esp.vrelu.s8 q4, s8, s9 - esp.vst.128.ip q4, a0, 16 - - esp.vld.128.ip q0, a1, 16 - esp.vadd.s8.ld.incp q1, a2, q5, q2, q3 - esp.vrelu.s8 q5, s8, s9 - esp.vst.128.ip q5, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - li t0, 1 - beq a5, t0, 2f #remainder == 2*16byte - li t0, 2 - beq a5, t0, 3f #remainder == 3*16byte - - 2: - esp.vld.128.ip q2, a1, 16 - esp.vadd.s8.ld.incp q3, a2, q4, q0, q1 - esp.vrelu.s8 q4, s8, s9 - esp.vst.128.ip q4, a0, 16 - - esp.vadd.s8 q5, q2, q3 - esp.vrelu.s8 q5, s8, s9 - esp.vst.128.ip q5, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - 3: - esp.vld.128.ip q2, a1, 16 - esp.vadd.s8.ld.incp q3, a2, q4, q0, q1 - esp.vrelu.s8 q4, s8, s9 - esp.vst.128.ip q4, a0, 16 - - esp.vld.128.ip q0, a1, 16 - esp.vadd.s8.ld.incp q1, a2, q5, q2, q3 - esp.vrelu.s8 q5, s8, s9 - esp.vst.128.ip q5, a0, 16 - - esp.vadd.s8 q4, q0, q1 - esp.vrelu.s8 q4, s8, s9 - esp.vst.128.ip q4, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -dl_esp32p4_s8_add2d_relu_small_channel: # channel < 3*16byte - - add t0, a5, x0 - blez t0, 1f - 0: - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - - esp.vadd.s8 q2, q0, q1 - - esp.vrelu.s8 q2, s8, s9 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - - esp.vadd.s8 q2, q0, q1 - esp.vrelu.s8 q2, s8, s9 - esp.vst.128.ip q2, a0, 16 - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - - .align 2 - .text - .global dl_esp32p4_s8_rescale_add2d_11c_relu - .type dl_esp32p4_s8_rescale_add2d_11c_relu, @function - #.section .iram1 -dl_esp32p4_s8_rescale_add2d_11c_relu: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr: >> shift or *scale) >> shift - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr: input1 >> shift + input0 * 1 - # a3: void *args - # a4: c_div_x_1 - # a5: input_shift - # t3: output_scale - # t4: output_shift - # s8: activation_alpha - # s9: activation_shift - - - lw a4, 64(a3) - lw a5, 88(a3) - lw t3, 96(a3) - lw t4, 92(a3) - lw s8, 52(a3) - lw s9, 60(a3) - - - li t0, 1 - beq t3, t0, dl_esp32p4_s8_rescale_add2d_output_relu - -dl_esp32p4_s8_rescale_add2d_output_scale_relu: # *scale) >> shift - - sb t3, 0(sp) - add s11, sp, x0 - esp.vldbc.8.ip q7, s11, 0 # all output_scale - - add t0, a4, x0 - blez t0, 1f - 0: - esp.ldqa.s8.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - esp.srcmb.s8.qacc q1, a5, 1 - - dl_esp32p4_rescale_add_rescale_output q0, q1, q1, q7, t4 - - esp.vrelu.s8 q1, s8, s9 - - esp.vst.128.ip q1, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - esp.ldqa.s8.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - esp.srcmb.s8.qacc q1, a5, 1 - - dl_esp32p4_rescale_add_rescale_output q0, q1, q1, q7, t4 - esp.vrelu.s8 q1, s8, s9 - esp.vst.128.ip q1, a0, 16 - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - -dl_esp32p4_s8_rescale_add2d_output_relu: # >> shift - li s1, 1 - sb s1, 0(sp) - add s11, sp, x0 - esp.vldbc.8.ip q7, s11, 0 # all 1 - - esp.ldqa.s8.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - add t0, a4, x0 - blez t0, 1f - 0: - esp.srcmb.s8.qacc q1, a5, 1 - esp.vmulas.s8.qacc.ld.ip q0, a1, 16, q0, q7 - esp.srcmb.s8.qacc q1, t4, 1 - esp.vrelu.s8 q1, s8, s9 - esp.ldqa.s8.128.ip a2, 16 - esp.vst.128.ip q1, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - esp.srcmb.s8.qacc q1, a5, 1 - esp.vmulas.s8.qacc q0, q7 - esp.srcmb.s8.qacc q1, t4, 1 - - esp.vrelu.s8 q1, s8, s9 - esp.vst.128.ip q1, a0, 16 - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - - .align 2 - .text - .global dl_esp32p4_s8_add2d_11c_prelu - .type dl_esp32p4_s8_add2d_11c_prelu, @function - #.section .iram1 -dl_esp32p4_s8_add2d_11c_prelu: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_2x_1 - # a5: c_left_x_1 - # s8: activation_alpha_ptr - # s9: activation_shift - - - lw a4, 68(a3) - lw a5, 72(a3) - lw s8, 56(a3) - lw s9, 60(a3) - - li t0, 1 - blt a4, t0, dl_esp32p4_s8_add2d_prelu_small_channel - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - add t0, a4, x0 - blez t0, 1f - 0: - esp.vld.128.ip q2, a1, 16 - esp.vld.128.ip q6, s8, 16 - esp.vadd.s8.ld.incp q3, a2, q4, q0, q1 - esp.vprelu.s8 q4, q4, q6, s9 - esp.vst.128.ip q4, a0, 16 - - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q6, s8, 16 - esp.vadd.s8.ld.incp q1, a2, q5, q2, q3 - esp.vprelu.s8 q5, q5, q6, s9 - esp.vst.128.ip q5, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - li t0, 1 - beq a5, t0, 2f #remainder == 2*16byte - li t0, 2 - beq a5, t0, 3f #remainder == 3*16byte - - 2: - esp.vld.128.ip q2, a1, 16 - esp.vld.128.ip q6, s8, 16 - esp.vadd.s8.ld.incp q3, a2, q4, q0, q1 - esp.vprelu.s8 q4, q4, q6, s9 - esp.vst.128.ip q4, a0, 16 - - esp.vld.128.ip q6, s8, 16 - esp.vadd.s8 q5, q2, q3 - - esp.vprelu.s8 q5, q5, q6, s9 - esp.vst.128.ip q5, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - 3: - esp.vld.128.ip q2, a1, 16 - esp.vld.128.ip q6, s8, 16 - esp.vadd.s8.ld.incp q3, a2, q4, q0, q1 - esp.vprelu.s8 q4, q4, q6, s9 - esp.vst.128.ip q4, a0, 16 - - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q6, s8, 16 - esp.vadd.s8.ld.incp q1, a2, q5, q2, q3 - esp.vprelu.s8 q5, q5, q6, s9 - esp.vst.128.ip q5, a0, 16 - - esp.vld.128.ip q6, s8, 16 - esp.vadd.s8 q4, q0, q1 - - esp.vprelu.s8 q4, q4, q6, s9 - esp.vst.128.ip q4, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -dl_esp32p4_s8_add2d_prelu_small_channel: # channel < 3*s - - add t0, a5, x0 - blez t0, 1f - 0: - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - - esp.vld.128.ip q3, s8, 16 - esp.vadd.s8 q2, q0, q1 - - esp.vprelu.s8 q2, q2, q3, s9 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - - esp.vld.128.ip q3, s8, 16 - esp.vadd.s8 q2, q0, q1 - - esp.vprelu.s8 q2, q2, q3, s9 - esp.vst.128.ip q2, a0, 16 - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - .align 2 - .text - .global dl_esp32p4_s8_rescale_add2d_11c_prelu - .type dl_esp32p4_s8_rescale_add2d_11c_prelu, @function - #.section .iram1 -dl_esp32p4_s8_rescale_add2d_11c_prelu: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr: >> shift or *scale) >> shift - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr: input1 >> shift + input0 * 1 - # a3: void *args - # a4: c_div_x_1 - # a5: input_shift - # t3: output_scale - # t4: output_shift - # s8: activation_alpha_ptr - # s9: activation_shift - - - lw a4, 64(a3) - lw a5, 88(a3) - lw t3, 96(a3) - lw t4, 92(a3) - lw s8, 56(a3) - lw s9, 60(a3) - - - li t0, 1 - beq t3, t0, dl_esp32p4_s8_rescale_add2d_output_prelu - - -dl_esp32p4_s8_rescale_add2d_output_scale_prelu: # *scale) >> shift - - sb t3, 0(sp) - add s11, sp, x0 - esp.vldbc.8.ip q7, s11, 0 # all output_scale - - add t0, a4, x0 - blez t0, 1f - 0: - esp.ldqa.s8.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - esp.srcmb.s8.qacc q1, a5, 1 - - esp.vld.128.ip q5, s8, 16 - dl_esp32p4_rescale_add_rescale_output q0, q1, q1, q7, t4 - - esp.vprelu.s8 q1, q1, q5, s9 - - esp.vst.128.ip q1, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - esp.ldqa.s8.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - esp.srcmb.s8.qacc q1, a5, 1 - - esp.vld.128.ip q5, s8, 16 - dl_esp32p4_rescale_add_rescale_output q0, q1, q1, q7, t4 - - esp.vprelu.s8 q1, q1, q5, s9 - esp.vst.128.ip q1, a0, 16 - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -dl_esp32p4_s8_rescale_add2d_output_prelu: # >> shift - li s1, 1 - sb s1, 0(sp) - add s11, sp, x0 - esp.vldbc.8.ip q7, s11, 0 # all 1 - - esp.ldqa.s8.128.ip a2, 16 - esp.vld.128.ip q0, a1, 16 - add t0, a4, x0 - blez t0, 1f - 0: - esp.srcmb.s8.qacc q1, a5, 1 - esp.vmulas.s8.qacc.ld.ip q0, a1, 16, q0, q7 - - esp.vld.128.ip q6, s8, 16 - esp.srcmb.s8.qacc q1, t4, 1 - esp.vprelu.s8 q1, q1, q6, s9 - esp.ldqa.s8.128.ip a2, 16 - esp.vst.128.ip q1, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - esp.srcmb.s8.qacc q1, a5, 1 - esp.vmulas.s8.qacc q0, q7 - - esp.vld.128.ip q6, s8, 16 - esp.srcmb.s8.qacc q1, t4, 1 - - esp.vprelu.s8 q1, q1, q6, s9 - esp.vst.128.ip q1, a0, 16 - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - - -############################################################################################################################################################ -#### -#### esp32p4_s8_unaligned_add2d_11c series -#### -############################################################################################################################################################ - - .align 2 - .text - .global dl_esp32p4_s8_unaligned_add2d_11c - .type dl_esp32p4_s8_unaligned_add2d_11c, @function - #.section .iram1 -dl_esp32p4_s8_unaligned_add2d_11c: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: input_shift - # t3: output_scale - # t4: output_shift - # t5: c_remainder - - lw a4, 64(a3) - lw t5, 76(a3) - lw a5, 88(a3) - - bgez a5, dl_esp32p4_s8_unaligned_rescale_add2d_11c - -# input0 exp = input1 exp = output exp - - esp.ld.128.usar.ip q5, a0, 0 #get output_ptr sar_byte - esp.movx.r.sar.bytes s1 - - bltz a4, dl_esp32p4_s8_unaligned_add2d_11c_small_remainder # channel < 16 - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - beqz s1, dl_esp32p4_s8_unaligned_add2d_11c_0 - li t0, 8 - beq s1, t0, dl_esp32p4_s8_unaligned_add2d_11c_1 - - - add t0, a4, x0 - blez t0, 1f - 0: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s8 q2, q2, q5 - - esp.ld.128.usar.ip q1, a1, 16 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - addi t0, t0, -1 - bgtz t0, 0b - 1: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s8 q2, q2, q5 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - j dl_esp32p4_s8_unaligned_add2d_11c_remainder - - #output sar = 0 - dl_esp32p4_s8_unaligned_add2d_11c_0: - add t0, a4, x0 - blez t0, 3f - 2: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s8 q2, q2, q5 - - esp.ld.128.usar.ip q1, a1, 16 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s8 q2, q2, q5 - esp.vst.128.ip q2, a0, 16 - j dl_esp32p4_s8_unaligned_add2d_11c_remainder - - # #output sar = 8 - dl_esp32p4_s8_unaligned_add2d_11c_1: - add t0, a4, x0 - blez t0, 5f - 4: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s8 q2, q2, q5 - - esp.ld.128.usar.ip q1, a1, 16 - esp32p4_s8_64b_unaligned_vector_store q2, a0 - addi t0, t0, -1 - bgtz t0, 4b - 5: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s8 q2, q2, q5 - esp32p4_s8_64b_unaligned_vector_store q2, a0 - j dl_esp32p4_s8_unaligned_add2d_11c_remainder - -dl_esp32p4_s8_unaligned_add2d_11c_small_remainder: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 - - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 - -dl_esp32p4_s8_unaligned_add2d_11c_remainder: - - beqz t5, dl_esp32p4_s8_unaligned_add2d_end - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.vadd.s8 q2, q2, q5 - - dl_esp32p4_s8_store_remainder q2, t4, t6, s0, s1, t0, a0, t5 - - dl_esp32p4_s8_unaligned_add2d_end: - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -## rescaled add -dl_esp32p4_s8_unaligned_rescale_add2d_11c: - lw t3, 96(a3) # output_scale - lw t4, 92(a3) # output_shift - - li t0, 1 - beq t3, t0, dl_esp32p4_s8_rescale_unaligned_add2d_output_shift - - -### rescaled to output by *scale) >> shift -dl_esp32p4_s8_rescale_unaligned_add2d_output_scale: - - sb t3, 0(sp) - add s11, sp, x0 - esp.vldbc.8.ip q7, s11, 0 # all output_scale - - bltz a4, dl_esp32p4_s8_rescale_unaligned_add2d_scale_small_remainder # channel < 16 - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - add t0, a4, x0 - blez t0, 7f - 6: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.mov.s8.qacc q5 - esp.srcmb.s8.qacc q1, a5, 1 - - dl_esp32p4_rescale_add_rescale_output q2, q1, q2, q7, t4 - - esp.ld.128.usar.ip q1, a1, 16 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s0 - addi t0, t0, -1 - bgtz t0, 6b - 7: - - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - - esp.mov.s8.qacc q5 - esp.srcmb.s8.qacc q1, a5, 1 - - dl_esp32p4_rescale_add_rescale_output q2, q1, q2, q7, t4 - - esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - j dl_esp32p4_s8_rescale_unaligned_add2d_scale_remainder - - -dl_esp32p4_s8_rescale_unaligned_add2d_scale_small_remainder: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - -dl_esp32p4_s8_rescale_unaligned_add2d_scale_remainder: - beqz t5, dl_esp32p4_s8_unaligned_rescale_add2d_output_scale_end # c remainder - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.mov.s8.qacc q5 - esp.srcmb.s8.qacc q1, a5, 1 - - dl_esp32p4_rescale_add_rescale_output q2, q1, q2, q7, t4 - - # esp32p4_s8_32b_unaligned_vector_store q2, a0, s0 - dl_esp32p4_s8_store_remainder q2, t4, t6, s0, s1, t0, a0, t5 - - dl_esp32p4_s8_unaligned_rescale_add2d_output_scale_end: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -### rescaled to output by right shift -dl_esp32p4_s8_rescale_unaligned_add2d_output_shift: - li s1, 1 - sb s1, 0(sp) - add s11, sp, x0 - esp.vldbc.8.ip q7, s11, 0 # all 1 - - bltz a4, dl_esp32p4_s8_rescale_unaligned_add2d_shift_small_remainder # channel < 16 - - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - add t0, a4, x0 - blez t0, 9f - 8: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - esp.mov.s8.qacc q5 - - esp.srcmb.s8.qacc q5, a5, 1 - esp.vmulas.s8.qacc q2, q7 - esp.srcmb.s8.qacc q5, t4, 1 - - esp.ld.128.usar.ip q1, a1, 16 - esp32p4_s8_32b_unaligned_vector_store q5, a0, s1 - addi t0, t0, -1 - bgtz t0, 8b - 9: - addi a1, a1, -16 - add a1, a1, t5 - - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - esp.mov.s8.qacc q5 - - esp.srcmb.s8.qacc q5, a5, 1 - esp.vmulas.s8.qacc q2, q7 - esp.srcmb.s8.qacc q5, t4, 1 - - esp32p4_s8_32b_unaligned_vector_store q5, a0, s1 - j dl_esp32p4_s8_rescale_unaligned_add2d_shift_remainder - - - -dl_esp32p4_s8_rescale_unaligned_add2d_shift_small_remainder: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - -dl_esp32p4_s8_rescale_unaligned_add2d_shift_remainder: - beqz t5, dl_esp32p4_s8_unaligned_rescale_add2d_output_shift_end # c remainder - - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.mov.s8.qacc q5 - esp.srcmb.s8.qacc q5, a5, 1 - esp.vmulas.s8.qacc q2, q7 - esp.srcmb.s8.qacc q5, t4, 1 - - # esp32p4_s8_32b_unaligned_vector_store q5, a0, s1 - dl_esp32p4_s8_store_remainder q5, t4, t6, s0, s1, t0, a0, t5 - - dl_esp32p4_s8_unaligned_rescale_add2d_output_shift_end: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - - - - .align 2 - .text - .global dl_esp32p4_s8_unaligned_add2d_11c_relu - .type dl_esp32p4_s8_unaligned_add2d_11c_relu, @function - #.section .iram1 -dl_esp32p4_s8_unaligned_add2d_11c_relu: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: input_shift - # t3: output_scale - # t4: output_shift - # t5: c_remainder - # s8: activation_alpha - # s9: activation_shift - - lw a4, 64(a3) - lw t5, 76(a3) - lw a5, 88(a3) - lw s8, 52(a3) - lw s9, 60(a3) - - bgez a5, dl_esp32p4_s8_unaligned_rescale_add2d_11c_relu - -# input0 exp = input1 exp = output exp - - esp.ld.128.usar.ip q5, a0, 0 #get output_ptr sar_byte - esp.movx.r.sar.bytes s1 - - bltz a4, dl_esp32p4_s8_unaligned_add2d_11c_relu_small_remainder # channel < 16 - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - beqz s1, dl_esp32p4_s8_unaligned_add2d_11c_relu_0 - li t0, 8 - beq s1, t0, dl_esp32p4_s8_unaligned_add2d_11c_relu_1 - - - add t0, a4, x0 - blez t0, 1f - 0: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s8 q2, q2, q5 - - esp.ld.128.usar.ip q1, a1, 16 - esp.vrelu.s8 q2, s8, s9 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - addi t0, t0, -1 - bgtz t0, 0b - 1: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s8 q2, q2, q5 - esp.vrelu.s8 q2, s8, s9 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - j dl_esp32p4_s8_unaligned_add2d_11c_relu_remainder - - #output sar = 0 - dl_esp32p4_s8_unaligned_add2d_11c_relu_0: - add t0, a4, x0 - blez t0, 3f - 2: - esp.src.q.qup q2, q0, q1 - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s8 q2, q2, q5 - - esp.ld.128.usar.ip q1, a1, 16 - esp.vrelu.s8 q2, s8, s9 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s8 q2, q2, q5 - esp.vrelu.s8 q2, s8, s9 - esp.vst.128.ip q2, a0, 16 - j dl_esp32p4_s8_unaligned_add2d_11c_relu_remainder - - # #output sar = 8 - dl_esp32p4_s8_unaligned_add2d_11c_relu_1: - add t0, a4, x0 - blez t0, 5f - 4: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - esp.vadd.s8 q2, q2, q5 - - esp.ld.128.usar.ip q1, a1, 16 - esp.vrelu.s8 q2, s8, s9 - esp32p4_s8_64b_unaligned_vector_store q2, a0 - addi t0, t0, -1 - bgtz t0, 4b - 5: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s8 q2, q2, q5 - esp.vrelu.s8 q2, s8, s9 - esp32p4_s8_64b_unaligned_vector_store q2, a0 - j dl_esp32p4_s8_unaligned_add2d_11c_relu_remainder - -dl_esp32p4_s8_unaligned_add2d_11c_relu_small_remainder: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 - - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 - -dl_esp32p4_s8_unaligned_add2d_11c_relu_remainder: - - beqz t5, dl_esp32p4_s8_unaligned_add2d_relu_end - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.vadd.s8 q2, q2, q5 - esp.vrelu.s8 q2, s8, s9 - - # esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - dl_esp32p4_s8_store_remainder q2, t4, t6, s0, s1, t0, a0, t5 - - dl_esp32p4_s8_unaligned_add2d_relu_end: - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -## rescaled add -dl_esp32p4_s8_unaligned_rescale_add2d_11c_relu: - lw t3, 96(a3) # output_scale - lw t4, 92(a3) # output_shift - - li t0, 1 - beq t3, t0, dl_esp32p4_s8_rescale_unaligned_add2d_output_shift_relu - - -### rescaled to output by *scale) >> shift -dl_esp32p4_s8_rescale_unaligned_add2d_output_scale_relu: - - sb t3, 0(sp) - add s11, sp, x0 - esp.vldbc.8.ip q7, s11, 0 # all output_scale - - bltz a4, dl_esp32p4_s8_rescale_unaligned_add2d_scale_relu_small_remainder # channel < 16 - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - add t0, a4, x0 - blez t0, 7f - 6: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.mov.s8.qacc q5 - esp.srcmb.s8.qacc q1, a5, 1 - - dl_esp32p4_rescale_add_rescale_output q2, q1, q2, q7, t4 - - esp.ld.128.usar.ip q1, a1, 16 - esp.vrelu.s8 q2, s8, s9 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s0 - addi t0, t0, -1 - bgtz t0, 6b - 7: - - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - - esp.mov.s8.qacc q5 - esp.srcmb.s8.qacc q1, a5, 1 - - dl_esp32p4_rescale_add_rescale_output q2, q1, q2, q7, t4 - - esp.vrelu.s8 q2, s8, s9 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - j dl_esp32p4_s8_rescale_unaligned_add2d_scale_relu_remainder - - -dl_esp32p4_s8_rescale_unaligned_add2d_scale_relu_small_remainder: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - -dl_esp32p4_s8_rescale_unaligned_add2d_scale_relu_remainder: - beqz t5, dl_esp32p4_s8_unaligned_rescale_add2d_output_scale_relu_end # c remainder - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.mov.s8.qacc q5 - esp.srcmb.s8.qacc q1, a5, 1 - - dl_esp32p4_rescale_add_rescale_output q2, q1, q2, q7, t4 - - esp.vrelu.s8 q2, s8, s9 - # esp32p4_s8_32b_unaligned_vector_store q2, a0, s0 - dl_esp32p4_s8_store_remainder q2, t4, t6, s0, s1, t0, a0, t5 - - dl_esp32p4_s8_unaligned_rescale_add2d_output_scale_relu_end: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -### rescaled to output by right shift -dl_esp32p4_s8_rescale_unaligned_add2d_output_shift_relu: - li s1, 1 - sb s1, 0(sp) - add s11, sp, x0 - esp.vldbc.8.ip q7, s11, 0 # all 1 - - bltz a4, dl_esp32p4_s8_rescale_unaligned_add2d_shift_relu_small_remainder # channel < 16 - - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - add t0, a4, x0 - blez t0, 9f - 8: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - esp.mov.s8.qacc q5 - - esp.srcmb.s8.qacc q5, a5, 1 - esp.vmulas.s8.qacc q2, q7 - esp.srcmb.s8.qacc q5, t4, 1 - - esp.ld.128.usar.ip q1, a1, 16 - esp.vrelu.s8 q5, s8, s9 - esp32p4_s8_32b_unaligned_vector_store q5, a0, s1 - addi t0, t0, -1 - bgtz t0, 8b - 9: - addi a1, a1, -16 - add a1, a1, t5 - - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - esp.mov.s8.qacc q5 - - esp.srcmb.s8.qacc q5, a5, 1 - esp.vmulas.s8.qacc q2, q7 - esp.srcmb.s8.qacc q5, t4, 1 - esp.vrelu.s8 q5, s8, s9 - esp32p4_s8_32b_unaligned_vector_store q5, a0, s1 - j dl_esp32p4_s8_rescale_unaligned_add2d_shift_relu_remainder - - - -dl_esp32p4_s8_rescale_unaligned_add2d_shift_relu_small_remainder: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - -dl_esp32p4_s8_rescale_unaligned_add2d_shift_relu_remainder: - beqz t5, dl_esp32p4_s8_unaligned_rescale_add2d_output_shift_relu_end # c remainder - - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.mov.s8.qacc q5 - esp.srcmb.s8.qacc q5, a5, 1 - esp.vmulas.s8.qacc q2, q7 - esp.srcmb.s8.qacc q5, t4, 1 - esp.vrelu.s8 q5, s8, s9 - # esp32p4_s8_32b_unaligned_vector_store q5, a0, s1 - dl_esp32p4_s8_store_remainder q5, t4, t6, s0, s1, t0, a0, t5 - - dl_esp32p4_s8_unaligned_rescale_add2d_output_shift_relu_end: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - .align 2 - .text - .global dl_esp32p4_s8_unaligned_add2d_11c_prelu - .type dl_esp32p4_s8_unaligned_add2d_11c_prelu, @function - #.section .iram1 -dl_esp32p4_s8_unaligned_add2d_11c_prelu: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: input_shift - # t3: output_scale - # t4: output_shift - # t5: c_remainder - # s8: activation_alpha_ptr - # s9: activation_shift - - lw a4, 64(a3) - lw t5, 76(a3) - lw a5, 88(a3) - lw s8, 56(a3) - lw s9, 60(a3) - - bgez a5, dl_esp32p4_s8_unaligned_rescale_add2d_11c_prelu - -# input0 exp = input1 exp = output exp - - esp.ld.128.usar.ip q5, a0, 0 #get output_ptr sar_byte - esp.movx.r.sar.bytes s1 - - bltz a4, dl_esp32p4_s8_unaligned_add2d_11c_prelu_small_remainder # channel < 16 - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - beqz s1, dl_esp32p4_s8_unaligned_add2d_11c_prelu_0 - li t0, 8 - beq s1, t0, dl_esp32p4_s8_unaligned_add2d_11c_prelu_1 - - - add t0, a4, x0 - blez t0, 1f - 0: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s8 q2, q2, q5 - - esp.vld.128.ip q6, s8, 16 - esp.ld.128.usar.ip q1, a1, 16 - esp.vprelu.s8 q2, q2, q6, s9 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - addi t0, t0, -1 - bgtz t0, 0b - 1: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vld.128.ip q6, s8, 16 - esp.vadd.s8 q2, q2, q5 - esp.vprelu.s8 q2, q2, q6, s9 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - j dl_esp32p4_s8_unaligned_add2d_11c_prelu_remainder - - #output sar = 0 - dl_esp32p4_s8_unaligned_add2d_11c_prelu_0: - add t0, a4, x0 - blez t0, 3f - 2: - esp.src.q.qup q2, q0, q1 - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vadd.s8 q2, q2, q5 - - esp.vld.128.ip q6, s8, 16 - esp.ld.128.usar.ip q1, a1, 16 - esp.vprelu.s8 q2, q2, q6, s9 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vld.128.ip q6, s8, 16 - esp.vadd.s8 q2, q2, q5 - esp.vprelu.s8 q2, q2, q6, s9 - esp.vst.128.ip q2, a0, 16 - j dl_esp32p4_s8_unaligned_add2d_11c_prelu_remainder - - # #output sar = 8 - dl_esp32p4_s8_unaligned_add2d_11c_prelu_1: - add t0, a4, x0 - blez t0, 5f - 4: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - esp.vadd.s8 q2, q2, q5 - - esp.vld.128.ip q6, s8, 16 - esp.ld.128.usar.ip q1, a1, 16 - esp.vprelu.s8 q2, q2, q6, s9 - esp32p4_s8_64b_unaligned_vector_store q2, a0 - addi t0, t0, -1 - bgtz t0, 4b - 5: - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 - esp.src.q.qup q5, q3, q4 - - esp.vld.128.ip q6, s8, 16 - esp.vadd.s8 q2, q2, q5 - esp.vprelu.s8 q2, q2, q6, s9 - esp32p4_s8_64b_unaligned_vector_store q2, a0 - j dl_esp32p4_s8_unaligned_add2d_11c_prelu_remainder - -dl_esp32p4_s8_unaligned_add2d_11c_prelu_small_remainder: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 - - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 - -dl_esp32p4_s8_unaligned_add2d_11c_prelu_remainder: - - beqz t5, dl_esp32p4_s8_unaligned_add2d_prelu_end - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.vld.128.ip q6, s8, 16 - esp.vadd.s8 q2, q2, q5 - esp.vprelu.s8 q2, q2, q6, s9 - # esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - dl_esp32p4_s8_store_remainder q2, t4, t6, s0, s1, t0, a0, t5 - - dl_esp32p4_s8_unaligned_add2d_prelu_end: - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -## rescaled add -dl_esp32p4_s8_unaligned_rescale_add2d_11c_prelu: - lw t3, 96(a3) # output_scale - lw t4, 92(a3) # output_shift - - li t0, 1 - beq t3, t0, dl_esp32p4_s8_rescale_unaligned_add2d_output_shift_prelu - - -### rescaled to output by *scale) >> shift -dl_esp32p4_s8_rescale_unaligned_add2d_output_scale_prelu: - - sb t3, 0(sp) - add s11, sp, x0 - esp.vldbc.8.ip q7, s11, 0 # all output_scale - - bltz a4, dl_esp32p4_s8_rescale_unaligned_add2d_scale_prelu_small_remainder # channel < 16 - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - add t0, a4, x0 - blez t0, 7f - 6: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.mov.s8.qacc q5 - esp.srcmb.s8.qacc q1, a5, 1 - - dl_esp32p4_rescale_add_rescale_output q2, q1, q2, q7, t4 - - esp.vld.128.ip q6, s8, 16 - esp.ld.128.usar.ip q1, a1, 16 - esp.vprelu.s8 q2, q2, q6, s9 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s0 - addi t0, t0, -1 - bgtz t0, 6b - 7: - - addi a1, a1, -16 - add a1, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - - esp.mov.s8.qacc q5 - esp.srcmb.s8.qacc q1, a5, 1 - - esp.vld.128.ip q6, s8, 16 - dl_esp32p4_rescale_add_rescale_output q2, q1, q2, q7, t4 - - esp.vprelu.s8 q2, q2, q6, s9 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - j dl_esp32p4_s8_rescale_unaligned_add2d_scale_prelu_remainder - - -dl_esp32p4_s8_rescale_unaligned_add2d_scale_prelu_small_remainder: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - -dl_esp32p4_s8_rescale_unaligned_add2d_scale_prelu_remainder: - beqz t5, dl_esp32p4_s8_unaligned_rescale_add2d_output_scale_prelu_end # c remainder - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.mov.s8.qacc q5 - esp.srcmb.s8.qacc q1, a5, 1 - - esp.vld.128.ip q6, s8, 16 - dl_esp32p4_rescale_add_rescale_output q2, q1, q2, q7, t4 - - esp.vprelu.s8 q2, q2, q6, s9 - # esp32p4_s8_32b_unaligned_vector_store q2, a0, s0 - dl_esp32p4_s8_store_remainder q2, t4, t6, s0, s1, t0, a0, t5 - - dl_esp32p4_s8_unaligned_rescale_add2d_output_scale_prelu_end: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - -### rescaled to output by right shift -dl_esp32p4_s8_rescale_unaligned_add2d_output_shift_prelu: - li s1, 1 - sb s1, 0(sp) - add s11, sp, x0 - esp.vldbc.8.ip q7, s11, 0 # all 1 - - bltz a4, dl_esp32p4_s8_rescale_unaligned_add2d_shift_prelu_small_remainder # channel < 16 - - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - add t0, a4, x0 - blez t0, 9f - 8: - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - esp.mov.s8.qacc q5 - - esp.srcmb.s8.qacc q5, a5, 1 - esp.vmulas.s8.qacc q2, q7 - esp.srcmb.s8.qacc q5, t4, 1 - - esp.vld.128.ip q6, s8, 16 - esp.ld.128.usar.ip q1, a1, 16 - esp.vprelu.s8 q5, q5, q6, s9 - esp32p4_s8_32b_unaligned_vector_store q5, a0, s1 - addi t0, t0, -1 - bgtz t0, 8b - 9: - addi a1, a1, -16 - add a1, a1, t5 - - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - esp.mov.s8.qacc q5 - - esp.srcmb.s8.qacc q5, a5, 1 - esp.vmulas.s8.qacc q2, q7 - esp.vld.128.ip q6, s8, 16 - esp.srcmb.s8.qacc q5, t4, 1 - esp.vprelu.s8 q5, q5, q6, s9 - esp32p4_s8_32b_unaligned_vector_store q5, a0, s1 - j dl_esp32p4_s8_rescale_unaligned_add2d_shift_prelu_remainder - - - -dl_esp32p4_s8_rescale_unaligned_add2d_shift_prelu_small_remainder: - esp.ld.128.usar.xp q0, a1, t5 - esp.movx.r.sar.bytes t6 #input0 sar - esp.ld.128.usar.xp q3, a2, t5 - esp.movx.r.sar.bytes s0 #input1 sar - -dl_esp32p4_s8_rescale_unaligned_add2d_shift_prelu_remainder: - beqz t5, dl_esp32p4_s8_unaligned_rescale_add2d_output_shift_prelu_end # c remainder - - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.mov.s8.qacc q5 - esp.srcmb.s8.qacc q5, a5, 1 - esp.vmulas.s8.qacc q2, q7 - esp.vld.128.ip q6, s8, 16 - esp.srcmb.s8.qacc q5, t4, 1 - esp.vprelu.s8 q5, q5, q6, s9 - # esp32p4_s8_32b_unaligned_vector_store q5, a0, s1 - dl_esp32p4_s8_store_remainder q5, t4, t6, s0, s1, t0, a0, t5 - - dl_esp32p4_s8_unaligned_rescale_add2d_output_shift_prelu_end: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_avg_pool2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_avg_pool2d.S deleted file mode 100644 index af49ec00..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_avg_pool2d.S +++ /dev/null @@ -1,603 +0,0 @@ -############################################################################################################################################################ -#### -#### dl_esp32p4_s8_avg_pool2d series -#### -############################################################################################################################################################ - -#include "dl_esp32p4_s8.S" -#include "dl_esp32p4_common.S" - - - .align 2 - .text - .global dl_esp32p4_s8_avg_pool2d_22c1 - .type dl_esp32p4_s8_avg_pool2d_22c1, @function - #.section .iram1 -dl_esp32p4_s8_avg_pool2d_22c1: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int16_t *output_ptr - # a1: int16_t *input_ptr - # a2: void *args - - lw a3, 16(a2) # input_y_offset - lw a4, 20(a2) # input_x_offset - lw t5, 4(a2) # input_channel - lw t6, 104(a2) # c_div_x_1 - lw s1, 56(a2) # shift - - addi s8, a2, 64 - esp.vldbc.8.ip q0, s8, 0 # avg_pool_area_inv - - add a5, a1, a4 - add t3, a1, a3 - add t4, t3, a4 - - esp.vld.128.ip q1, a1, 16 - esp.vld.128.ip q2, a5, 16 - add t0, t6, x0 - blez t0, 1f - 0: - esp.zero.qacc - esp.vmulas.s8.qacc.ld.ip q3, t3, 16, q0, q1 - esp.vmulas.s8.qacc.ld.ip q4, t4, 16, q0, q2 - esp.vmulas.s8.qacc.ld.ip q1, a1, 16, q0, q3 - esp.vmulas.s8.qacc.ld.ip q2, a5, 16, q0, q4 - esp.srcmb.s8.qacc q7, s1, 1 - esp.vst.128.ip q7, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - esp.zero.qacc - esp.vmulas.s8.qacc.ld.ip q3, t3, 16, q0, q1 - esp.vmulas.s8.qacc.ld.ip q4, t4, 16, q0, q2 - esp.vmulas.s8.qacc.ld.ip q1, a1, 16, q0, q3 - esp.vmulas.s8.qacc.ld.ip q2, a5, 16, q0, q4 - esp.srcmb.s8.qacc q7, s1, 1 - - esp.vst.128.ip q7, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - - - .align 2 - .text - .global dl_esp32p4_s8_unaligned_avg_pool2d_22c1 - .type dl_esp32p4_s8_unaligned_avg_pool2d_22c1, @function - #.section .iram1 -dl_esp32p4_s8_unaligned_avg_pool2d_22c1: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - # a0: int16_t *output_ptr - # a1: int16_t *input_ptr - # a2: void *args - - lw a3, 16(a2) # input_y_offset - lw a4, 20(a2) # input_x_offset - lw t5, 4(a2) # input_channel - lw t6, 104(a2) # c_div_x_1 - lw s0, 60(a2) # c_remainder - lw s1, 56(a2) # shift - - addi s8, a2, 64 - esp.vldbc.8.ip q6, s8, 0 # avg_pool_area_inv - - add a5, a1, a4 - add t3, a1, a3 - add t4, t3, a4 - - bltz t6, dl_esp32p4_s8_unaligned_avg_pool2d_22c1_remainder #channel < 16 - - esp.ld.128.usar.ip q7, a0, 0 #get output_ptr sar_byte - esp.movx.r.sar.bytes s9 - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q1, a1, 0 - - beqz s9, 2f - li t0, 8 - beq s9, t0, 3f - - add t0, t6, x0 - blez t0, 1f - 0: - esp.zero.qacc - esp.src.q.ld.ip q2, a5, 16, q0, q1 - - esp.ld.128.usar.ip q3, a5, 0 - esp.vmulas.s8.qacc q6, q0 - esp.src.q.ld.ip q4, t3, 16, q2, q3 - - esp.ld.128.usar.ip q5, t3, 0 - esp.vmulas.s8.qacc q6, q2 - esp.src.q.ld.ip q2, t4, 16, q4, q5 - - esp.ld.128.usar.ip q3, t4, 0 - esp.vmulas.s8.qacc q6, q4 - esp.src.q.ld.ip q0, a1, 16, q2, q3 - - esp.ld.128.usar.ip q1, a1, 0 - esp.vmulas.s8.qacc q6, q2 - esp.srcmb.s8.qacc q7, s1, 1 - esp32p4_s8_32b_unaligned_vector_store q7, a0, s8 - addi t0, t0, -1 - bgtz t0, 0b - 1: - j dl_esp32p4_s8_unaligned_avg_pool2d_22c1_loop_end - - -2: - add t0, t6, x0 - blez t0, 1f - 0: - esp.zero.qacc - esp.src.q.ld.ip q2, a5, 16, q0, q1 - - esp.ld.128.usar.ip q3, a5, 0 - esp.vmulas.s8.qacc q6, q0 - esp.src.q.ld.ip q4, t3, 16, q2, q3 - - esp.ld.128.usar.ip q5, t3, 0 - esp.vmulas.s8.qacc q6, q2 - esp.src.q.ld.ip q2, t4, 16, q4, q5 - - esp.ld.128.usar.ip q3, t4, 0 - esp.vmulas.s8.qacc q6, q4 - esp.src.q.ld.ip q0, a1, 16, q2, q3 - - esp.ld.128.usar.ip q1, a1, 0 - esp.vmulas.s8.qacc q6, q2 - esp.srcmb.s8.qacc q7, s1, 1 - esp.vst.128.ip q7, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - j dl_esp32p4_s8_unaligned_avg_pool2d_22c1_loop_end - -3: - add t0, t6, x0 - blez t0, 1f - 0: - esp.zero.qacc - esp.src.q.ld.ip q2, a5, 16, q0, q1 - - esp.ld.128.usar.ip q3, a5, 0 - esp.vmulas.s8.qacc q6, q0 - esp.src.q.ld.ip q4, t3, 16, q2, q3 - - esp.ld.128.usar.ip q5, t3, 0 - esp.vmulas.s8.qacc q6, q2 - esp.src.q.ld.ip q2, t4, 16, q4, q5 - - esp.ld.128.usar.ip q3, t4, 0 - esp.vmulas.s8.qacc q6, q4 - esp.src.q.ld.ip q0, a1, 16, q2, q3 - - esp.ld.128.usar.ip q1, a1, 0 - esp.vmulas.s8.qacc q6, q2 - esp.srcmb.s8.qacc q7, s1, 1 - esp32p4_s8_64b_unaligned_vector_store q7, a0 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - -dl_esp32p4_s8_unaligned_avg_pool2d_22c1_loop_end: - esp.zero.qacc - esp.src.q.ld.ip q2, a5, 16, q0, q1 - - esp.ld.128.usar.ip q3, a5, 0 - esp.vmulas.s8.qacc q6, q0 - esp.src.q.ld.ip q4, t3, 16, q2, q3 - - esp.ld.128.usar.ip q5, t3, 0 - esp.vmulas.s8.qacc q6, q2 - esp.src.q.ld.ip q2, t4, 16, q4, q5 - - esp.ld.128.usar.ip q3, t4, 0 - esp.vmulas.s8.qacc q6, q4 - esp.src.q q2, q2, q3 - esp.vmulas.s8.qacc q6, q2 - esp.srcmb.s8.qacc q7, s1, 1 - esp32p4_s8_32b_unaligned_vector_store q7, a0, s8 - - beqz s0, dl_esp32p4_s8_unaligned_avg_pool2d_22c1_end - -dl_esp32p4_s8_unaligned_avg_pool2d_22c1_remainder: - esp.ld.128.usar.xp q0, a1, s0 - esp.vld.128.ip q1, a1, 0 - esp.zero.qacc - esp.src.q q0, q0, q1 - - esp.ld.128.usar.xp q2, a5, s0 - esp.vld.128.ip q3, a5, 0 - esp.vmulas.s8.qacc q6, q0 - esp.src.q q2, q2, q3 - - esp.ld.128.usar.xp q4, t3, s0 - esp.vld.128.ip q5, t3, 0 - esp.vmulas.s8.qacc q6, q2 - esp.src.q q4, q4, q5 - - esp.ld.128.usar.xp q2, t4, s0 - esp.vld.128.ip q3, t4, 0 - esp.vmulas.s8.qacc q6, q4 - esp.src.q q2, q2, q3 - - esp.vmulas.s8.qacc q6, q2 - esp.srcmb.s8.qacc q7, s1, 1 - - dl_esp32p4_s8_store_remainder q7, t3, t4, t5, t6, t0, a0, s0 - -dl_esp32p4_s8_unaligned_avg_pool2d_22c1_end: - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - .align 2 - .text - .global dl_esp32p4_s8_avg_pool2d_hwc1 - .type dl_esp32p4_s8_avg_pool2d_hwc1, @function - #.section .iram1 -dl_esp32p4_s8_avg_pool2d_hwc1: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - lw a3, 16(a2) # input_y_offset - lw a4, 20(a2) # input_x_offset - lw a5, 4(a2) # input_channel - lw t3, 48(a2) # filter_height - lw t4, 52(a2) # filter_width - lw t6, 104(a2) # c_div_x_1 - lw s1, 56(a2) # shift - - addi s8, a2, 64 - esp.vldbc.8.ip q0, s8, 0 # avg_pool_area_inv - - srli t5, t4, 1 - addi t5, t5, -1 # filter_w / 2 - 1 - - li t0, 1 - beq t4, t0, dl_esp32p4_s8_avg_pool2d_h1c1 #filter_width == 1 - li t0, 1 - blt t6, t0, dl_esp32p4_s8_avg_pool2d_hw_small_channel - - 6: - add a5, a1, x0 - add s8, a5, x0 - add s9, t3, x0 - esp.zero.qacc - 5: - esp.vld.128.xp q1, s8, a4 - esp.vld.128.xp q2, s8, a4 - add t0, t5, x0 - blez t0, 1f - 0: - esp.vmulas.s8.qacc.ld.xp q1, s8, a4, q0, q1 - esp.vmulas.s8.qacc.ld.xp q2, s8, a4, q0, q2 - addi t0, t0, -1 - bgtz t0, 0b - 1: - andi t0, t4, 1 - beqz t0, 3f - 2:#three left - esp.vmulas.s8.qacc.ld.xp q1, s8, a4, q0, q1 - esp.vmulas.s8.qacc q0, q2 - esp.vmulas.s8.qacc q0, q1 - j 4f - - 3: # two left - esp.vmulas.s8.qacc q0, q1 - esp.vmulas.s8.qacc q0, q2 - 4: - addi s9, s9, -1 - add a5, a5, a3 - add s8, a5, x0 - bnez s9, 5b - - esp.srcmb.s8.qacc q7, s1, 1 - esp.vst.128.ip q7, a0, 16 - addi a1, a1, 16 - addi t6, t6, -1 - bnez t6, 6b - -dl_esp32p4_s8_avg_pool2d_hw_small_channel: - add a5, a1, x0 - add s8, a5, x0 - add s9, t3, x0 - esp.zero.qacc - 5: - esp.vld.128.xp q1, s8, a4 - esp.vld.128.xp q2, s8, a4 - add t0, t5, x0 - blez t0, 1f - 0: - esp.vmulas.s8.qacc.ld.xp q1, s8, a4, q0, q1 - esp.vmulas.s8.qacc.ld.xp q2, s8, a4, q0, q2 - addi t0, t0, -1 - bgtz t0, 0b - 1: - andi t0, t4, 1 - beqz t0, 2f - 2:#three left - esp.vmulas.s8.qacc.ld.xp q1, s8, a4, q0, q1 - esp.vmulas.s8.qacc q0, q2 - esp.vmulas.s8.qacc q0, q1 - j 4f - - 3: # two left - esp.vmulas.s8.qacc q0, q1 - esp.vmulas.s8.qacc q0, q2 - 4: - addi s9, s9, -1 - add a5, a5, a3 - add s8, a5, x0 - bnez s9, 5b - esp.srcmb.s8.qacc q7, s1, 1 - - esp.vst.128.ip q7, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - -dl_esp32p4_s8_avg_pool2d_h1c1: - addi t3, t3, -1 - li t0, 1 - blt t6, t0, dl_esp32p4_s8_max_pool2d_h1_small_channel - 2: - add s8, a1, x0 - esp.zero.qacc - esp.vld.128.xp q1, s8, a3 - add t0, t3, x0 - blez t0, 1f - 0: - esp.vmulas.s8.qacc.ld.xp q1, s8, a3, q0, q1 - addi t0, t0, -1 - bgtz t0, 0b - 1: - esp.vmulas.s8.qacc q0, q1 - esp.srcmb.s8.qacc q7, s1, 1 - esp.vst.128.ip q7, a0, 16 - addi a1, a1, 16 - addi t6, t6, -1 - bnez t6, 2b - -dl_esp32p4_s8_max_pool2d_h1_small_channel: - add s8, a1, x0 - esp.zero.qacc - esp.vld.128.xp q1, s8, a3 - add t0, t3, x0 - blez t0, 1f - 0: - esp.vmulas.s8.qacc.ld.xp q1, s8, a3, q0, q1 - addi t0, t0, -1 - bgtz t0, 0b - 1: - esp.vmulas.s8.qacc q0, q1 - esp.srcmb.s8.qacc q7, s1, 1 - - esp.vst.128.ip q7, a0, 16 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - - - .align 2 - .text - .global dl_esp32p4_s8_unaligned_avg_pool2d_hwc1 - .type dl_esp32p4_s8_unaligned_avg_pool2d_hwc1, @function - #.section .iram1 -dl_esp32p4_s8_unaligned_avg_pool2d_hwc1: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int16_t *output_ptr - # a1: int16_t *input_ptr - # a2: void *args - - lw a3, 16(a2) # input_y_offset - lw a4, 20(a2) # input_x_offset - lw a5, 4(a2) # input_channel - lw t3, 48(a2) # filter_height - lw t4, 52(a2) # filter_width - lw t6, 104(a2) # c_div_x_1 - lw s0, 60(a2) # c_remainder - lw s1, 56(a2) # shift - - addi s8, a2, 64 - esp.vldbc.8.ip q6, s8, 0 # avg_pool_area_inv - - srli t5, t4, 1 - addi t5, t5, -1 # filter_w / 2 - 1 - - addi a4, a4, -16 - - esp.ld.128.usar.ip q7, a0, 0 #get output_ptr sar_byte - esp.movx.r.sar.bytes s9 - - addi t6, t6, 1 - - li t0, 1 - beq t4, t0, dl_esp32p4_s8_unaligned_avg_pool2d_h1c1 #filter_width == 1 - li t0, 1 - blt t6, t0, dl_esp32p4_s8_unaligned_avg_pool2d_hw_small_channel - - - 9: - add a5, a1, x0 - add s8, a5, x0 - add s0, t3, x0 - esp.zero.qacc - 5: - esp.ld.128.usar.ip q0, s8, 16 - esp.ld.128.usar.xp q1, s8, a4 - add t0, t5, x0 - blez t0, 1f - 0: - esp.src.q.ld.ip q2, s8, 16, q0, q1 - esp.ld.128.usar.xp q1, s8, a4 - esp.vmulas.s8.qacc q6, q0 - - esp.src.q.ld.ip q0, s8, 16, q2, q1 - esp.ld.128.usar.xp q1, s8, a4 - esp.vmulas.s8.qacc q6, q2 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - andi t0, t4, 1 - beqz t0, 2f - 2:#three left - esp.src.q.ld.ip q2, s8, 16, q0, q1 - esp.ld.128.usar.xp q1, s8, a4 - esp.vmulas.s8.qacc q6, q0 - - esp.src.q.ld.ip q0, s8, 16, q2, q1 - esp.ld.128.usar.xp q1, s8, a4 - esp.vmulas.s8.qacc q6, q2 - - esp.src.q q0, q0, q1 - esp.vmulas.s8.qacc q6, q0 - - j 4f - - 3:# two left - esp.src.q.ld.ip q2, s8, 16, q0, q1 - esp.ld.128.usar.xp q1, s8, a4 - esp.vmulas.s8.qacc q6, q0 - - esp.src.q q2, q2, q1 - esp.vmulas.s8.qacc q6, q2 - - 4: - addi s0, s0, -1 - add a5, a5, a3 - add s8, a5, x0 - bnez s0, 5b - - esp.srcmb.s8.qacc q7, s1, 1 - - beqz s9, 6f - li t0, 8 - beq s9, t0, 7f - - esp32p4_s8_32b_unaligned_vector_store q7, a0, s8 - j 8f - - 6: - esp.vst.128.ip q7, a0, 16 - j 8f - 7: - esp32p4_s8_64b_unaligned_vector_store q7, a0 - - 8: - addi a1, a1, 16 - addi t6, t6, -1 - bnez t6, 9b - -dl_esp32p4_s8_unaligned_avg_pool2d_hw_small_channel: - lw s0, 60(a2) # c_remainder - beqz s0, dl_esp32p4_s8_unaligned_avg_pool2d_hw_small_channel_end - - add a5, a1, x0 - add s8, a5, x0 - add s9, t3, x0 - addi a4, a4, 16 - sub a4, a4, s0 - - esp.zero.qacc - 2: - add t0, t4, x0 - blez t0, 1f - 0: - esp.ld.128.usar.xp q0, s8, s0 - esp.vld.128.xp q1, s8, a4 - esp.src.q q0, q0, q1 - esp.vmulas.s8.qacc q6, q0 - addi t0, t0, -1 - bgtz t0, 0b - 1: - addi s9, s9, -1 - add a5, a5, a3 - add s8, a5, x0 - bnez s9, 2b - - esp.srcmb.s8.qacc q7, s1, 1 - dl_esp32p4_s8_store_remainder q7, t3, t4, t5, t6, t0, a0, s0 - -dl_esp32p4_s8_unaligned_avg_pool2d_hw_small_channel_end: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - -dl_esp32p4_s8_unaligned_avg_pool2d_h1c1: - addi a3, a3, -16 - li t0, 1 - blt t6, t0, dl_esp32p4_s8_unaligned_avg_pool2d_h1_remainder - - 5: - add s8, a1, x0 - esp.zero.qacc - add t0, t3, x0 - blez t0, 1f - 0: - esp.ld.128.usar.ip q0, s8, 16 - esp.vld.128.xp q1, s8, a3 - esp.src.q q0, q0, q1 - esp.vmulas.s8.qacc q6, q0 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - esp.srcmb.s8.qacc q7, s1, 1 - beqz s9, 2f - li t0, 8 - beq s9, t0, 2f - - esp32p4_s8_32b_unaligned_vector_store q7, a0, t4 - j 3f - 2: - esp.vst.128.ip q7, a0, 16 - j 4f - 3: - esp32p4_s8_64b_unaligned_vector_store q7, a0 - - 4: - addi a1, a1, 16 - addi t6, t6, -1 - bnez t6, 5b - -dl_esp32p4_s8_unaligned_avg_pool2d_h1_remainder: - beqz s0, dl_esp32p4_s8_unaligned_avg_pool2d_hwc1_end - - add s8, a1, x0 - addi a3, a3, 16 - sub a3, a3, s0 - esp.zero.qacc - add t0, t3, x0 - blez t0, 1f - 0: - esp.ld.128.usar.xp q0, s8, s0 - esp.vld.128.xp q1, s8, a3 - esp.src.q q0, q0, q1 - esp.vmulas.s8.qacc q6, q0 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - esp.srcmb.s8.qacc q7, s1, 1 - dl_esp32p4_s8_store_remainder q7, t3, t4, t5, t6, t0, a0, s0 - -dl_esp32p4_s8_unaligned_avg_pool2d_hwc1_end: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_conv2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_conv2d.S deleted file mode 100644 index 879cbf8a..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_conv2d.S +++ /dev/null @@ -1,1179 +0,0 @@ -#include "dl_esp32p4_s8.S" - -############################################################################################################################################################ -#### -#### esp32p4_s8_conv2d_11cn series -#### -############################################################################################################################################################ -.macro esp32p4_s8_conv2d_11c16 input_v0 input_ptr filter_v0 filter_v1 filter_ptr c_div_x_1 - # scalar * vecter and accumulate into QACC - # input_ptr += (c_div_x_1 + 1) * 16 in the end - # filter_ptr point to the next 16 bytes in the end - - # input_v0: 16 input elements - # filter_v0: 16 filter elements - # filter_v1: 16 filter elements - # input_ptr: input_ptr - # filter_ptr: filter_ptr - # c_div_x_1: input_channel // 16 - 1 - - esp.vld.128.ip \input_v0, \input_ptr, 16 - esp.vld.128.ip \filter_v0, \filter_ptr, 16 - esp.vld.128.ip \filter_v1, \filter_ptr, 16 - beqz \c_div_x_1, 1f - - # lp.setup 0, \c_div_x_1, 0f - esp.lp.setup 0, \c_div_x_1, 0f - # 0: - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 9 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 10 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 11 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 12 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 13 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 14 - esp.vsmulas.s8.qacc.ld.incp \input_v0, \input_ptr, \filter_v1, \input_v0, 15 - 0: esp.vld.128.ip \filter_v1, \filter_ptr, 16 - # addi \c_div_x_1, \c_div_x_1, -1 - # bgtz \c_div_x_1, 0b - - 1: - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 9 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 10 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 11 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 12 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 13 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 14 - esp.vsmulas.s8.qacc \filter_v1, \input_v0, 15 -.endm - - - -############################################################################################################################################################ -#### -#### esp32p4_s8_conv2d_11cn -#### -############################################################################################################################################################ -.macro esp32p4_s8_conv2d_11cn_load_args args filter_ptr c_div_x_1 n_rs3 mac_shift - lw \n_rs3, 96(\args) // output_channel_div_8 - lw \mac_shift, 64(\args) // mac_shift - lw \filter_ptr, 48(\args) // filter - lw \c_div_x_1, 100(\args) // input_channel / x - 1 -.endm - - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_11cn_bias - .type dl_esp32p4_s8_conv2d_11cn_bias, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_11cn_bias: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: bias_ptr - # t5: moving_input_ptr - # t6: - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_s8_conv2d_11cn_load_args a2, a3, a4, a5, t3 - # Because the subsequent esp.lp.setup loop instruction compares for a value >= 0 and cannot be negative, we subtract 1 in advance here. - # addi a4, a4, -1 - lw t4, 68(a2) // bias - - esp32p4_s8_conv2d_11cn_bias_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t4 - esp32p4_s8_conv2d_11c16 q0, t5, q1, q2, a3, a4 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_11cn_bias_loop - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_11cn_bias_relu - .type dl_esp32p4_s8_conv2d_11cn_bias_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_11cn_bias_relu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: bias_ptr - # t5: moving_input_ptr - # t6: - - # s0: activation_alpha/_address - # s1: activation_shift - # s8: - # s9: - # s10: - # s11: - - addi sp, sp, -8 - sw s0, 4(sp) - sw s1, 0(sp) - - esp32p4_s8_conv2d_11cn_load_args a2, a3, a4, a5, t3 - lw t4, 68(a2) // bias - lw s0, 76(a2) // activation_alpha - lw s1, 84(a2) // activation_shift - - esp32p4_s8_conv2d_11cn_bias_relu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t4 - esp32p4_s8_conv2d_11c16 q0, t5, q1, q2, a3, a4 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp32p4_s8_128b_vector_relu q0, s0, s1 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_11cn_bias_relu_loop - - lw s0, 4(sp) // restore s0 - lw s1, 0(sp) // restore s1 - addi sp, sp, 8 - ret - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_11cn_bias_prelu - .type dl_esp32p4_s8_conv2d_11cn_bias_prelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_11cn_bias_prelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: bias_ptr - # t5: moving_input_ptr - # t6: - - # s0: activation_alpha/_address - # s1: activation_shift - # s8: - # s9: - # s10: - # s11: - - addi sp, sp, -8 - sw s0, 4(sp) - sw s1, 0(sp) - - esp32p4_s8_conv2d_11cn_load_args a2, a3, a4, a5, t3 - lw t4, 68(a2) // bias - lw s0, 80(a2) // activation_alpha_ptr - lw s1, 84(a2) // activation_shift - - esp32p4_s8_conv2d_11cn_bias_prelu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t4 - esp32p4_s8_conv2d_11c16 q0, t5, q1, q2, a3, a4 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp32p4_s8_128b_vector_prelu q0, q1, s0, s1 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_11cn_bias_prelu_loop - - lw s0, 4(sp) // restore s0 - lw s1, 0(sp) // restore s1 - addi sp, sp, 8 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_11cn - .type dl_esp32p4_s8_conv2d_11cn, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_11cn: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: - # t5: moving_input_ptr - # t6: - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_s8_conv2d_11cn_load_args a2, a3, a4, a5, t3 - # Because the subsequent esp.lp.setup loop instruction compares for a value >= 0 and cannot be negative, we subtract 1 in advance here. - # addi a4, a4, -1 - - esp32p4_s8_conv2d_11cn_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_11c16 q0, t5, q1, q2, a3, a4 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_11cn_loop - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_11cn_relu - .type dl_esp32p4_s8_conv2d_11cn_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_11cn_relu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: - # t5: moving_input_ptr - # t6: - - # s0: activation_alpha/_address - # s1: activation_shift - # s8: - # s9: - # s10: - # s11: - - addi sp, sp, -8 - sw s0, 4(sp) - sw s1, 0(sp) - - esp32p4_s8_conv2d_11cn_load_args a2, a3, a4, a5, t3 - lw s0, 76(a2) // activation_alpha - lw s1, 84(a2) // activation_shift - - esp32p4_s8_conv2d_11cn_relu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_11c16 q0, t5, q1, q2, a3, a4 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp32p4_s8_128b_vector_relu q0, s0, s1 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_11cn_relu_loop - - lw s0, 4(sp) // restore s0 - lw s1, 0(sp) // restore s1 - addi sp, sp, 8 - ret - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_11cn_prelu - .type dl_esp32p4_s8_conv2d_11cn_prelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_11cn_prelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: - # t5: moving_input_ptr - # t6: - - # s0: activation_alpha/_address - # s1: activation_shift - # s8: - # s9: - # s10: - # s11: - - addi sp, sp, -8 - sw s0, 4(sp) - sw s1, 0(sp) - - esp32p4_s8_conv2d_11cn_load_args a2, a3, a4, a5, t3 - lw s0, 80(a2) // activation_alpha_ptr - lw s1, 84(a2) // activation_shift - - esp32p4_s8_conv2d_11cn_prelu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_11c16 q0, t5, q1, q2, a3, a4 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp32p4_s8_128b_vector_prelu q0, q1, s0, s1 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_11cn_prelu_loop - - lw s0, 4(sp) // restore s0 - lw s1, 0(sp) // restore s1 - addi sp, sp, 8 - ret - - - - - - -############################################################################################################################################################ -#### -#### esp32p4_s8_conv2d_33cn series -#### -############################################################################################################################################################ -.macro esp32p4_s8_conv2d_33c16 input_v0 filter_v0 filter_v1 input_ptr filter_ptr c_div_x_1 dilation_x_offset dilation_y_offset - # dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(output_t) - # dilation_y_offset = (dilation_y * input_width_with_padding * input_channel_with_padding - input_channel - dilation_x * input_channel_with_padding * (filter_width - 1)) * sizeof(output_t) - - esp32p4_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_y_offset - - esp32p4_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_y_offset - - esp32p4_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - # add \input_ptr, \input_ptr, \dilation_y_offset -.endm - - - -.macro esp32p4_s8_conv2d_hwcn_load_args args filter_ptr c_div_x_1 n_rs3 mac_shift dilation_x_offset dilation_y_offset - esp32p4_s8_conv2d_11cn_load_args \args, \filter_ptr, \c_div_x_1, \n_rs3, \mac_shift - lw \dilation_x_offset, 108(\args) // input dilation x offset - lw \dilation_y_offset, 112(\args) // input dilation y offset -.endm - - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_33cn_bias - .type dl_esp32p4_s8_conv2d_33cn_bias, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_33cn_bias: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: bias_ptr - # t5: moving_input_ptr - # t6: - - # s0: input dilation x offset - # s1: input dilation y offset - # s8: - # s9: - # s10: - # s11: - - addi sp, sp, -8 - sw s0, 4(sp) - sw s1, 0(sp) - - esp32p4_s8_conv2d_hwcn_load_args a2, a3, a4, a5, t3, s0, s1 - lw t4, 68(a2) // bias - - esp32p4_s8_conv2d_33cn_bias_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t4 - esp32p4_s8_conv2d_33c16 q0, q1, q2, t5, a3, a4, s0, s1 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_33cn_bias_loop - - lw s0, 4(sp) // restore s0 - lw s1, 0(sp) // restore s1 - addi sp, sp, 8 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_33cn_bias_relu - .type dl_esp32p4_s8_conv2d_33cn_bias_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_33cn_bias_relu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: bias_ptr - # t5: moving_input_ptr - # t6: - - # s0: input dilation x offset - # s1: input dilation y offset - # s8: activation_alpha/_address - # s9: activation_shift - # s10: - # s11: - - addi sp, sp, -16 - sw s0, 12(sp) - sw s1, 8(sp) - sw s8, 4(sp) - sw s9, 0(sp) - - esp32p4_s8_conv2d_hwcn_load_args a2, a3, a4, a5, t3, s0, s1 - lw t4, 68(a2) // bias - lw s8, 76(a2) // activation_alpha - lw s9, 84(a2) // activation_shift - - esp32p4_s8_conv2d_33cn_bias_relu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t4 - esp32p4_s8_conv2d_33c16 q0, q1, q2, t5, a3, a4, s0, s1 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp32p4_s8_128b_vector_relu q0, s8, s9 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_33cn_bias_relu_loop - - lw s0, 12(sp) // restore s0 - lw s1, 8(sp) // restore s1 - lw s8, 4(sp) // restore s8 - lw s9, 0(sp) // restore s9 - addi sp, sp, 16 - ret - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_33cn_bias_prelu - .type dl_esp32p4_s8_conv2d_33cn_bias_prelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_33cn_bias_prelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: bias_ptr - # t5: moving_input_ptr - # t6: - - # s0: input dilation x offset - # s1: input dilation y offset - # s8: activation_alpha/_address - # s9: activation_shift - # s10: - # s11: - - addi sp, sp, -16 - sw s0, 12(sp) - sw s1, 8(sp) - sw s8, 4(sp) - sw s9, 0(sp) - - esp32p4_s8_conv2d_hwcn_load_args a2, a3, a4, a5, t3, s0, s1 - lw t4, 68(a2) // bias - lw s8, 80(a2) // activation_alpha_ptr - lw s9, 84(a2) // activation_shift - - esp32p4_s8_conv2d_33cn_bias_prelu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t4 - esp32p4_s8_conv2d_33c16 q0, q1, q2, t5, a3, a4, s0, s1 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp32p4_s8_128b_vector_prelu q0, q1, s8, s9 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_33cn_bias_prelu_loop - - lw s0, 12(sp) // restore s0 - lw s1, 8(sp) // restore s1 - lw s8, 4(sp) // restore s8 - lw s9, 0(sp) // restore s9 - addi sp, sp, 16 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_33cn - .type dl_esp32p4_s8_conv2d_33cn, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_33cn: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: - # t5: moving_input_ptr - # t6: - - # s0: input dilation x offset - # s1: input dilation y offset - # s8: - # s9: - # s10: - # s11: - - addi sp, sp, -8 - sw s0, 4(sp) - sw s1, 0(sp) - - esp32p4_s8_conv2d_hwcn_load_args a2, a3, a4, a5, t3, s0, s1 - - esp32p4_s8_conv2d_33cn_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_33c16 q0, q1, q2, t5, a3, a4, s0, s1 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_33cn_loop - - lw s0, 4(sp) // restore s0 - lw s1, 0(sp) // restore s1 - addi sp, sp, 8 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_33cn_relu - .type dl_esp32p4_s8_conv2d_33cn_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_33cn_relu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: - # t5: moving_input_ptr - # t6: - - # s0: input dilation x offset - # s1: input dilation y offset - # s8: activation_alpha/_address - # s9: activation_shift - # s10: - # s11: - - addi sp, sp, -16 - sw s0, 12(sp) - sw s1, 8(sp) - sw s8, 4(sp) - sw s9, 0(sp) - - esp32p4_s8_conv2d_hwcn_load_args a2, a3, a4, a5, t3, s0, s1 - lw s8, 76(a2) // activation_alpha - lw s9, 84(a2) // activation_shift - - esp32p4_s8_conv2d_33cn_relu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_33c16 q0, q1, q2, t5, a3, a4, s0, s1 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp32p4_s8_128b_vector_relu q0, s8, s9 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_33cn_relu_loop - - lw s0, 12(sp) // restore s0 - lw s1, 8(sp) // restore s1 - lw s8, 4(sp) // restore s8 - lw s9, 0(sp) // restore s9 - addi sp, sp, 16 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_33cn_prelu - .type dl_esp32p4_s8_conv2d_33cn_prelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_33cn_prelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: - # t5: moving_input_ptr - # t6: - - # s0: input dilation x offset - # s1: input dilation y offset - # s8: activation_alpha/_address - # s9: activation_shift - # s10: - # s11: - - addi sp, sp, -16 - sw s0, 12(sp) - sw s1, 8(sp) - sw s8, 4(sp) - sw s9, 0(sp) - - esp32p4_s8_conv2d_hwcn_load_args a2, a3, a4, a5, t3, s0, s1 - lw s8, 80(a2) // activation_alpha_ptr - lw s9, 84(a2) // activation_shift - - esp32p4_s8_conv2d_33cn_prelu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_33c16 q0, q1, q2, t5, a3, a4, s0, s1 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp32p4_s8_128b_vector_prelu q0, q1, s8, s9 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_33cn_prelu_loop - - lw s0, 12(sp) // restore s0 - lw s1, 8(sp) // restore s1 - lw s8, 4(sp) // restore s8 - lw s9, 0(sp) // restore s9 - addi sp, sp, 16 - ret - - - - -############################################################################################################################################################ -#### -#### esp32p4_s8_conv2d_hwcn series -#### -############################################################################################################################################################ -.macro esp32p4_s8_conv2d_hwc16 input_v0 filter_v0 filter_v1 input_ptr filter_ptr c_div_x_1 dilation_x_offset dilation_y_offset filter_h filter_w args filter_offset_q - # dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(output_t) - # dilation_y_offset = (dilation_y * input_width_with_padding * input_channel_with_padding - input_channel - dilation_x * input_channel_with_padding * (filter_width - 1)) * sizeof(output_t) - # filter_h - # filter_w - - lw \filter_h, 52(\args) # filter_height - 2: - lw \filter_w, 56(\args) # filter_width - addi \filter_w, \filter_w, -1 - beqz \filter_w, 4f - // lp.setup 1, \filter_w, 3f - // esp32p4_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - // 3: add \input_ptr, \input_ptr, \dilation_x_offset - 3: - esp32p4_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - addi \filter_w, \filter_w, -1 - bgtz \filter_w, 3b - 4: - esp32p4_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - esp.movi.32.a \filter_offset_q, \filter_w, 1 - add \filter_ptr, \filter_ptr, \filter_w - add \input_ptr, \input_ptr, \dilation_y_offset - - addi \filter_h, \filter_h, -1 - bnez \filter_h, 2b - - esp.movi.32.a \filter_offset_q, \filter_h, 2 - add \filter_ptr, \filter_ptr, \filter_h -.endm - - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_hwcn_bias - .type dl_esp32p4_s8_conv2d_hwcn_bias, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_hwcn_bias: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: bias_ptr - # t5: moving_input_ptr - # t6: - - # s0: input dilation x offset - # s1: input dilation y offset - # s8: filter_height - # s9: filter_width - # s10: - # s11: - - addi sp, sp, -16 - sw s0, 12(sp) - sw s1, 8(sp) - sw s8, 4(sp) - sw s9, 0(sp) - - esp32p4_s8_conv2d_hwcn_load_args a2, a3, a4, a5, t3, s0, s1 - lw t4, 68(a2) // bias - lw s9, 60(a2) // filter_y_offset - lw s8, 144(a2) - esp.movi.32.q q6, s9, 1 // filter_y_offset - esp.movi.32.q q6, s8, 2 // filter_n_offset - - esp32p4_s8_conv2d_hwcn_bias_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t4 - esp32p4_s8_conv2d_hwc16 q0, q1, q2, t5, a3, a4, s0, s1, s8, s9, a2, q6 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_hwcn_bias_loop - - lw s0, 12(sp) // restore s0 - lw s1, 8(sp) // restore s1 - lw s8, 4(sp) // restore s8 - lw s9, 0(sp) // restore s9 - addi sp, sp, 16 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_hwcn_bias_relu - .type dl_esp32p4_s8_conv2d_hwcn_bias_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_hwcn_bias_relu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: bias_ptr - # t5: moving_input_ptr - # t6: activation_alpha/_address - - # s0: input dilation x offset - # s1: input dilation y offset - # s8: filter_height - # s9: filter_width - # s10: activation_shift - # s11: - - addi sp, sp, -20 - sw s0, 16(sp) - sw s1, 12(sp) - sw s8, 8(sp) - sw s9, 4(sp) - sw s10, 0(sp) - - esp32p4_s8_conv2d_hwcn_load_args a2, a3, a4, a5, t3, s0, s1 - lw t4, 68(a2) // bias - lw t6, 76(a2) // activation_alpha - lw s10, 84(a2) // activation_shift - lw s9, 60(a2) // filter_y_offset - lw s8, 144(a2) - esp.movi.32.q q6, s9, 1 // filter_y_offset - esp.movi.32.q q6, s8, 2 // filter_n_offset - - esp32p4_s8_conv2d_hwcn_bias_relu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t4 - esp32p4_s8_conv2d_hwc16 q0, q1, q2, t5, a3, a4, s0, s1, s8, s9, a2, q6 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp32p4_s8_128b_vector_relu q0, t6, s10 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_hwcn_bias_relu_loop - - lw s0, 16(sp) // restore s0 - lw s1, 12(sp) // restore s1 - lw s8, 8(sp) // restore s8 - lw s9, 4(sp) // restore s9 - lw s10, 0(sp) // restore s10 - addi sp, sp, 20 - ret - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_hwcn_bias_prelu - .type dl_esp32p4_s8_conv2d_hwcn_bias_prelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_hwcn_bias_prelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: bias_ptr - # t5: moving_input_ptr - # t6: activation_alpha/_address - - # s0: input dilation x offset - # s1: input dilation y offset - # s8: filter_height - # s9: filter_width - # s10: activation_shift - # s11: - - addi sp, sp, -20 - sw s0, 16(sp) - sw s1, 12(sp) - sw s8, 8(sp) - sw s9, 4(sp) - sw s10, 0(sp) - - esp32p4_s8_conv2d_hwcn_load_args a2, a3, a4, a5, t3, s0, s1 - lw t4, 68(a2) // bias - lw t6, 80(a2) // activation_alpha_ptr - lw s10, 84(a2) // activation_shift - lw s9, 60(a2) // filter_y_offset - lw s8, 144(a2) - esp.movi.32.q q6, s9, 1 // filter_y_offset - esp.movi.32.q q6, s8, 2 // filter_n_offset - - esp32p4_s8_conv2d_hwcn_bias_prelu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t4 - esp32p4_s8_conv2d_hwc16 q0, q1, q2, t5, a3, a4, s0, s1, s8, s9, a2, q6 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp32p4_s8_128b_vector_prelu q0, q1, t6, s10 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_hwcn_bias_prelu_loop - - lw s0, 16(sp) // restore s0 - lw s1, 12(sp) // restore s1 - lw s8, 8(sp) // restore s8 - lw s9, 4(sp) // restore s9 - lw s10, 0(sp) // restore s10 - addi sp, sp, 20 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_hwcn - .type dl_esp32p4_s8_conv2d_hwcn, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_hwcn: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: - # t5: moving_input_ptr - # t6: - - # s0: input dilation x offset - # s1: input dilation y offset - # s8: filter_height - # s9: filter_width - # s10: - # s11: - - addi sp, sp, -16 - sw s0, 12(sp) - sw s1, 8(sp) - sw s8, 4(sp) - sw s9, 0(sp) - - esp32p4_s8_conv2d_hwcn_load_args a2, a3, a4, a5, t3, s0, s1 - lw s9, 60(a2) // filter_y_offset - lw s8, 144(a2) - esp.movi.32.q q6, s9, 1 // filter_y_offset - esp.movi.32.q q6, s8, 2 // filter_n_offset - - esp32p4_s8_conv2d_hwcn_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_hwc16 q0, q1, q2, t5, a3, a4, s0, s1, s8, s9, a2, q6 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_hwcn_loop - - lw s0, 12(sp) // restore s0 - lw s1, 8(sp) // restore s1 - lw s8, 4(sp) // restore s8 - lw s9, 0(sp) // restore s9 - addi sp, sp, 16 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_hwcn_relu - .type dl_esp32p4_s8_conv2d_hwcn_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_hwcn_relu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: - # t5: moving_input_ptr - # t6: activation_alpha/_address - - # s0: input dilation x offset - # s1: input dilation y offset - # s8: filter_height - # s9: filter_width - # s10: activation_shift - # s11: - - addi sp, sp, -20 - sw s0, 16(sp) - sw s1, 12(sp) - sw s8, 8(sp) - sw s9, 4(sp) - sw s10, 0(sp) - - esp32p4_s8_conv2d_hwcn_load_args a2, a3, a4, a5, t3, s0, s1 - lw t6, 76(a2) // activation_alpha - lw s10, 84(a2) // activation_shift - lw s9, 60(a2) // filter_y_offset - lw s8, 144(a2) - esp.movi.32.q q6, s9, 1 // filter_y_offset - esp.movi.32.q q6, s8, 2 // filter_n_offset - - esp32p4_s8_conv2d_hwcn_relu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_hwc16 q0, q1, q2, t5, a3, a4, s0, s1, s8, s9, a2, q6 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp32p4_s8_128b_vector_relu q0, t6, s10 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_hwcn_relu_loop - - lw s0, 16(sp) // restore s0 - lw s1, 12(sp) // restore s1 - lw s8, 8(sp) // restore s8 - lw s9, 4(sp) // restore s9 - lw s10, 0(sp) // restore s10 - addi sp, sp, 20 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_conv2d_hwcn_prelu - .type dl_esp32p4_s8_conv2d_hwcn_prelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_conv2d_hwcn_prelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_div_x_1 - # a5: n_rs3 - # t3: mac_shift - # t4: - # t5: moving_input_ptr - # t6: activation_alpha/_address - - # s0: input dilation x offset - # s1: input dilation y offset - # s8: filter_height - # s9: filter_width - # s10: activation_shift - # s11: - - addi sp, sp, -20 - sw s0, 16(sp) - sw s1, 12(sp) - sw s8, 8(sp) - sw s9, 4(sp) - sw s10, 0(sp) - - esp32p4_s8_conv2d_hwcn_load_args a2, a3, a4, a5, t3, s0, s1 - lw t6, 80(a2) // activation_alpha_ptr - lw s10, 84(a2) // activation_shift - lw s9, 60(a2) // filter_y_offset - lw s8, 144(a2) - esp.movi.32.q q6, s9, 1 // filter_y_offset - esp.movi.32.q q6, s8, 2 // filter_n_offset - - esp32p4_s8_conv2d_hwcn_prelu_loop: - mv t5, a1 // reload input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_hwc16 q0, q1, q2, t5, a3, a4, s0, s1, s8, s9, a2, q6 - esp32p4_s8_128b_vector_shift_result q0, t3 - esp32p4_s8_128b_vector_prelu q0, q1, t6, s10 - esp.vst.128.ip q0, a0, 16 - addi a5, a5, -1 - bnez a5, esp32p4_s8_conv2d_hwcn_prelu_loop - - lw s0, 16(sp) // restore s0 - lw s1, 12(sp) // restore s1 - lw s8, 8(sp) // restore s8 - lw s9, 4(sp) // restore s9 - lw s10, 0(sp) // restore s10 - addi sp, sp, 20 - ret \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_depthwise_conv2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_depthwise_conv2d.S deleted file mode 100644 index 6f09c034..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_depthwise_conv2d.S +++ /dev/null @@ -1,973 +0,0 @@ -#include "dl_esp32p4_s8.S" - -############################################################################################################################################################ -#### -#### esp32p4_s8_depthwise_conv2d_33c1 series -#### -############################################################################################################################################################ -.macro esp32p4_s8_depthwise_conv2d_33s1 input_v0 filter_v0 input_v1 filter_v1 input_v2 filter_v2 input_ptr filter_ptr dilation_x_offset dilation_y_offset next_33s1 - # dilation_x_offset = input_channel_with_padding * dilation_x * sizeof(T) - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) - # next_33s1 = (-(filter_width - 1) * dilation_x * input_channel_with_padding - (filter_height - 1) * dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) + 16 - - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - esp.vld.128.xp \input_v2, \input_ptr, \dilation_y_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - esp.vld.128.xp \input_v0, \input_ptr, \dilation_x_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - esp.vld.128.xp \input_v1, \input_ptr, \dilation_x_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - esp.vld.128.xp \input_v2, \input_ptr, \dilation_y_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - esp.vld.128.xp \input_v0, \input_ptr, \dilation_x_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - esp.vld.128.xp \input_v1, \input_ptr, \dilation_x_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - esp.vld.128.xp \input_v2, \input_ptr, \next_33s1 - - esp.vmulas.s8.qacc.ld.ip \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - esp.vld.128.xp \input_v0, \input_ptr, \dilation_x_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - esp.vld.128.xp \input_v1, \input_ptr, \dilation_x_offset -.endm - - - -.macro esp32p4_s8_depthwise_conv2d_33s1_last input_v0 filter_v0 input_v1 filter_v1 input_ptr filter_ptr dilation_x_offset dilation_y_offset - # dilation_x_offset = input_channel_with_padding * dilation_x * sizeof(T) - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) - - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - esp.vld.128.xp \input_v0, \input_ptr, \dilation_y_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - esp.vld.128.xp \input_v1, \input_ptr, \dilation_x_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - esp.vld.128.xp \input_v0, \input_ptr, \dilation_x_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - esp.vld.128.xp \input_v1, \input_ptr, \dilation_y_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - esp.vld.128.xp \input_v0, \input_ptr, \dilation_x_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - esp.vld.128.xp \input_v1, \input_ptr, \dilation_x_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - esp.vld.128.ip \input_v0, \input_ptr, 0 - - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - # block one cycle here - - esp.vmulas.s8.qacc \input_v0, \filter_v0 -.endm - - - -.macro esp32p4_s8_depthwise_conv2d_33c1_load_args args filter_ptr dilation_x_offset dilation_y_offset next_hwx1 c_div_x_1 mac_shift - # dilation_x_offset = input_channel_with_padding * dilation_x * sizeof(T) - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) - # next_hwx1 = (-(filter_width - 1) * dilation_x * input_channel_with_padding - (filter_height - 1) * dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) + 16 - - lw \filter_ptr, 48(\args) - lw \dilation_x_offset, 124(\args) - lw \dilation_y_offset, 128(\args) - lw \next_hwx1, 132(\args) - lw \c_div_x_1, 100(\args) - lw \mac_shift, 64 (\args) -.endm - - - - .text - .align 2 - .global dl_esp32p4_s8_depthwise_conv2d_33c1_bias - .type dl_esp32p4_s8_depthwise_conv2d_33c1_bias, @function - .balign 4 - .option norvc -dl_esp32p4_s8_depthwise_conv2d_33c1_bias: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: input dilation x offset - # a5: input dilation y offset - # t3: next_33s1 - # t4: c_div_x_1 - # t5: mac_shift - # t6: bias_ptr - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_s8_depthwise_conv2d_33c1_load_args a2, a3, a4, a5, t3, t4, t5 - lw t6, 68(a2) // bias - - esp.vld.128.xp q0, a1, a4 - esp.vld.128.ip q1, a3, 16 - esp.vld.128.xp q2, a1, a4 - - beqz t4, esp32p4_s8_depthwise_conv2d_33c1_bias_loop_last - - # lp.setup 0, t4, 1f - esp.lp.setup 0, t4, 1f - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t5 - 1: esp.vst.128.ip q3, a0, 16 - - esp32p4_s8_depthwise_conv2d_33c1_bias_loop_last: - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a1, a3, a4, a5 - esp32p4_s8_128b_vector_shift_result q3, t5 - esp.vst.128.ip q3, a0, 16 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_depthwise_conv2d_33c1_bias_relu - .type dl_esp32p4_s8_depthwise_conv2d_33c1_bias_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_depthwise_conv2d_33c1_bias_relu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: input dilation x offset - # a5: input dilation y offset - # t3: next_33s1 - # t4: c_div_x_1 - # t5: mac_shift - # t6: bias_ptr - - # s0: activation_alpha/_address - # s1: activation_shift - # s8: - # s9: - # s10: - # s11: - - addi sp, sp, -8 - sw s0, 4(sp) - sw s1, 0(sp) - - esp32p4_s8_depthwise_conv2d_33c1_load_args a2, a3, a4, a5, t3, t4, t5 - lw t6, 68(a2) // bias - lw s0, 76(a2) // activation_alpha - lw s1, 84(a2) // activation_shift - - esp.vld.128.xp q0, a1, a4 - esp.vld.128.ip q1, a3, 16 - esp.vld.128.xp q2, a1, a4 - - beqz t4, esp32p4_s8_depthwise_conv2d_33c1_bias_relu_loop_last - - # lp.setup 0, t4, 1f - esp.lp.setup 0, t4, 1f - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t5 - esp32p4_s8_128b_vector_relu q3, s0, s1 - 1: esp.vst.128.ip q3, a0, 16 - - esp32p4_s8_depthwise_conv2d_33c1_bias_relu_loop_last: - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a1, a3, a4, a5 - esp32p4_s8_128b_vector_shift_result q3, t5 - esp32p4_s8_128b_vector_relu q3, s0, s1 - esp.vst.128.ip q3, a0, 16 - - lw s0, 4(sp) // restore s0 - lw s1, 0(sp) // restore s1 - addi sp, sp, 8 - ret - - - .text - .align 2 - .global dl_esp32p4_s8_depthwise_conv2d_33c1_bias_prelu - .type dl_esp32p4_s8_depthwise_conv2d_33c1_bias_prelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_depthwise_conv2d_33c1_bias_prelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: input dilation x offset - # a5: input dilation y offset - # t3: next_33s1 - # t4: c_div_x_1 - # t5: mac_shift - # t6: bias_ptr - - # s0: activation_alpha/_address - # s1: activation_shift - # s8: - # s9: - # s10: - # s11: - - addi sp, sp, -8 - sw s0, 4(sp) - sw s1, 0(sp) - - esp32p4_s8_depthwise_conv2d_33c1_load_args a2, a3, a4, a5, t3, t4, t5 - lw t6, 68(a2) // bias - lw s0, 80(a2) // activation_alpha_ptr - lw s1, 84(a2) // activation_shift - - esp.vld.128.xp q0, a1, a4 - esp.vld.128.ip q1, a3, 16 - esp.vld.128.xp q2, a1, a4 - - beqz t4, esp32p4_s8_depthwise_conv2d_33c1_bias_prelu_loop_last - - # lp.setup 0, t4, 1f - esp.lp.setup 0, t4, 1f - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t5 - esp32p4_s8_128b_vector_prelu q3, q4, s0, s1 - 1: esp.vst.128.ip q3, a0, 16 - - esp32p4_s8_depthwise_conv2d_33c1_bias_prelu_loop_last: - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a1, a3, a4, a5 - esp32p4_s8_128b_vector_shift_result q3, t5 - esp32p4_s8_128b_vector_prelu q3, q4, s0, s1 - esp.vst.128.ip q3, a0, 16 - - lw s0, 4(sp) // restore s0 - lw s1, 0(sp) // restore s1 - addi sp, sp, 8 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_depthwise_conv2d_33c1 - .type dl_esp32p4_s8_depthwise_conv2d_33c1, @function - .balign 4 - .option norvc -dl_esp32p4_s8_depthwise_conv2d_33c1: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: input dilation x offset - # a5: input dilation y offset - # t3: next_33s1 - # t4: c_div_x_1 - # t5: mac_shift - # t6: - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_s8_depthwise_conv2d_33c1_load_args a2, a3, a4, a5, t3, t4, t5 - - esp.vld.128.xp q0, a1, a4 - esp.vld.128.ip q1, a3, 16 - esp.vld.128.xp q2, a1, a4 - - beqz t4, esp32p4_s8_depthwise_conv2d_33c1_loop_last - - # lp.setup 0, t4, 1f - esp.lp.setup 0, t4, 1f - esp.zero.qacc - esp32p4_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t5 - 1: esp.vst.128.ip q3, a0, 16 - - esp32p4_s8_depthwise_conv2d_33c1_loop_last: - esp.zero.qacc - esp32p4_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a1, a3, a4, a5 - esp32p4_s8_128b_vector_shift_result q3, t5 - esp.vst.128.ip q3, a0, 16 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_depthwise_conv2d_33c1_relu - .type dl_esp32p4_s8_depthwise_conv2d_33c1_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_depthwise_conv2d_33c1_relu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: input dilation x offset - # a5: input dilation y offset - # t3: next_33s1 - # t4: c_div_x_1 - # t5: mac_shift - # t6: - - # s0: activation_alpha/_address - # s1: activation_shift - # s8: - # s9: - # s10: - # s11: - - addi sp, sp, -8 - sw s0, 4(sp) - sw s1, 0(sp) - - esp32p4_s8_depthwise_conv2d_33c1_load_args a2, a3, a4, a5, t3, t4, t5 - lw s0, 76(a2) // activation_alpha - lw s1, 84(a2) // activation_shift - - esp.vld.128.xp q0, a1, a4 - esp.vld.128.ip q1, a3, 16 - esp.vld.128.xp q2, a1, a4 - - beqz t4, esp32p4_s8_depthwise_conv2d_33c1_relu_loop_last - - # lp.setup 0, t4, 1f - esp.lp.setup 0, t4, 1f - esp.zero.qacc - esp32p4_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t5 - esp32p4_s8_128b_vector_relu q3, s0, s1 - 1: esp.vst.128.ip q3, a0, 16 - - esp32p4_s8_depthwise_conv2d_33c1_relu_loop_last: - esp.zero.qacc - esp32p4_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a1, a3, a4, a5 - esp32p4_s8_128b_vector_shift_result q3, t5 - esp32p4_s8_128b_vector_relu q3, s0, s1 - esp.vst.128.ip q3, a0, 16 - - lw s0, 4(sp) // restore s0 - lw s1, 0(sp) // restore s1 - addi sp, sp, 8 - ret - - - .text - .align 2 - .global dl_esp32p4_s8_depthwise_conv2d_33c1_prelu - .type dl_esp32p4_s8_depthwise_conv2d_33c1_prelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_depthwise_conv2d_33c1_prelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: input dilation x offset - # a5: input dilation y offset - # t3: next_33s1 - # t4: c_div_x_1 - # t5: mac_shift - # t6: - - # s0: activation_alpha/_address - # s1: activation_shift - # s8: - # s9: - # s10: - # s11: - - addi sp, sp, -8 - sw s0, 4(sp) - sw s1, 0(sp) - - esp32p4_s8_depthwise_conv2d_33c1_load_args a2, a3, a4, a5, t3, t4, t5 - lw s0, 80(a2) // activation_alpha_ptr - lw s1, 84(a2) // activation_shift - - esp.vld.128.xp q0, a1, a4 - esp.vld.128.ip q1, a3, 16 - esp.vld.128.xp q2, a1, a4 - - beqz t4, esp32p4_s8_depthwise_conv2d_33c1_prelu_loop_last - - # lp.setup 0, t4, 1f - esp.lp.setup 0, t4, 1f - esp.zero.qacc - esp32p4_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t5 - esp32p4_s8_128b_vector_prelu q3, q4, s0, s1 - 1: esp.vst.128.ip q3, a0, 16 - - esp32p4_s8_depthwise_conv2d_33c1_prelu_loop_last: - esp.zero.qacc - esp32p4_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a1, a3, a4, a5 - esp32p4_s8_128b_vector_shift_result q3, t5 - esp32p4_s8_128b_vector_prelu q3, q4, s0, s1 - esp.vst.128.ip q3, a0, 16 - - lw s0, 4(sp) // restore s0 - lw s1, 0(sp) // restore s1 - addi sp, sp, 8 - ret - - - -############################################################################################################################################################ -#### -#### esp32p4_s8_depthwise_conv2d_hwc1 series -#### -############################################################################################################################################################ - -.macro esp32p4_s8_depthwise_conv2d_1ws1 input_v0, input_v1, input_v2, filter_v0, filter_v1, filter_v2, input_ptr, filter_ptr, dilation_x_offset, dilation_y_offset, tmp_value, filter_w, filter_w_rs1_1, filter_y_offset - beqz \filter_w_rs1_1, 1f - # lp.setup 0, \filter_w_rs1_1, 0f - esp.lp.setup 0, \filter_w_rs1_1, 0f - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - esp.vld.128.xp \input_v0, \input_ptr, \dilation_x_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - 0: esp.vld.128.xp \input_v1, \input_ptr, \dilation_x_offset - - 1: - andi \tmp_value, \filter_w, 0xFFFFFFFE - beq \tmp_value, \filter_w, 2f - # three 8-input-element left - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - esp.vld.128.xp \input_v2, \input_ptr, \dilation_y_offset - - esp.vmulas.s8.qacc.ld.xp \filter_v2, \filter_ptr, \filter_y_offset, \input_v1, \filter_v1 - esp.vld.128.xp \input_v0, \input_ptr, \dilation_x_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - esp.vld.128.xp \input_v1, \input_ptr, \dilation_x_offset - j 3f - 2: # two 8-input-element left - esp.vmulas.s8.qacc.ld.xp \filter_v1, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - sub \input_ptr, \input_ptr, \dilation_x_offset - add \input_ptr, \input_ptr, \dilation_y_offset - esp.vld.128.xp \input_v0, \input_ptr, \dilation_x_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - esp.vld.128.xp \input_v1, \input_ptr, \dilation_x_offset - 3: -.endm - - - -.macro esp32p4_s8_depthwise_conv2d_1ws1_last input_v0, input_v1, filter_v0, filter_v1, input_ptr, filter_ptr, dilation_x_offset, dilation_y_offset, tmp_value, filter_w, filter_w_rs1_1, next_hws1, filter_y_offset - beqz \filter_w_rs1_1, 5f - # lp.setup 0, \filter_w_rs1_1, 4f - esp.lp.setup 0, \filter_w_rs1_1, 4f - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - esp.vld.128.xp \input_v0, \input_ptr, \dilation_x_offset - - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - 4: esp.vld.128.xp \input_v1, \input_ptr, \dilation_x_offset - - 5: - andi \tmp_value, \filter_w, 0xFFFFFFFE - beq \tmp_value, \filter_w, 6f - # three 8-input-element left - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - esp.vld.128.xp \input_v0, \input_ptr, \next_hws1 - - esp.vmulas.s8.qacc.ld.xp \filter_v0, \filter_ptr, \filter_y_offset, \input_v1, \filter_v1 - # block one cyle here - esp.vmulas.s8.qacc \input_v0, \filter_v0 - j 7f - 6: # two 8-input-element left - esp.vmulas.s8.qacc.ld.xp \filter_v1, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - sub \input_ptr, \input_ptr, \dilation_x_offset - add \input_ptr, \input_ptr, \next_hws1 - esp.vmulas.s8.qacc \input_v1, \filter_v1 - 7: -.endm - - - -.macro esp32p4_s8_depthwise_conv2d_hws1 input_v0, input_v1, input_v2, filter_v0, filter_v1, filter_v2, input_ptr, filter_ptr, dilation_x_offset, dilation_y_offset, next_hws1, filter_h, filter_w, filter_w_rs1_1, args, filter_offset_q, filter_y_offset, tmp_value - # dilation_x_offset = input_channel_with_padding * dilation_x * sizeof(T) - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) - # next_hws1 = (-(filter_width - 1) * dilation_x * input_channel_with_padding - (filter_height - 1) * dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) + 16 - # filter_w_rs1_1 - - lw \filter_h, 52(\args) # filter_height - - addi \tmp_value, \filter_w, -1 - beqz \tmp_value, 10f - esp.vld.128.ip \filter_v0, \filter_ptr, 16 - esp.vld.128.xp \input_v0, \input_ptr, \dilation_x_offset - esp.vld.128.xp \input_v1, \input_ptr, \dilation_x_offset - - addi \filter_h, \filter_h, -1 - beqz \filter_h, 9f - // lp.setup 1, \filter_h, 8f - // 8: esp32p4_s8_depthwise_conv2d_1ws1 \input_v0, \input_v1, \input_v2, \filter_v0, \filter_v1, \filter_v2, \input_ptr, \filter_ptr, \dilation_x_offset, \dilation_y_offset, \tmp_value, \filter_w, \filter_w_rs1_1, \filter_y_offset - 8: - esp32p4_s8_depthwise_conv2d_1ws1 \input_v0, \input_v1, \input_v2, \filter_v0, \filter_v1, \filter_v2, \input_ptr, \filter_ptr, \dilation_x_offset, \dilation_y_offset, \tmp_value, \filter_w, \filter_w_rs1_1, \filter_y_offset - addi \filter_h, \filter_h, -1 - bgtz \filter_h, 8b - 9: # last y - esp32p4_s8_depthwise_conv2d_1ws1_last \input_v0, \input_v1, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \dilation_x_offset, \dilation_y_offset, \tmp_value, \filter_w, \filter_w_rs1_1, \next_hws1, \filter_y_offset - j 13f - - 10: # filter_w == 1 - esp.vld.128.xp \filter_v0, \filter_ptr, \filter_y_offset - esp.vld.128.xp \input_v0, \input_ptr, \dilation_y_offset - addi \filter_h, \filter_h, -1 - beqz \filter_h, 12f - // lp.setup 1, \filter_h, 11f - // esp.vmulas.s8.qacc.ld.xp \filter_v0, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - // 11: esp.vld.128.xp \input_v0, \input_ptr, \dilation_y_offset - 11: - esp.vmulas.s8.qacc.ld.xp \filter_v0, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - esp.vld.128.xp \input_v0, \input_ptr, \dilation_y_offset - addi \filter_h, \filter_h, -1 - bgtz \filter_h, 11b - 12: # last y - esp.vmulas.s8.qacc \input_v0, \filter_v0 - sub \input_ptr, \input_ptr, \dilation_y_offset - add \input_ptr, \input_ptr, \next_hws1 - - 13: - esp.movi.32.a \filter_offset_q, \filter_h, 2 - add \filter_ptr, \filter_ptr, \filter_h -.endm - - - -.macro esp32p4_s8_depthwise_conv2d_hwc1_load_args args, filter_ptr, dilation_x_offset, dilation_y_offset, next_hws1, c_div_x_1, mac_shift, filter_w, filter_w_rs1_1 - esp32p4_s8_depthwise_conv2d_33c1_load_args \args, \filter_ptr, \dilation_x_offset, \dilation_y_offset, \next_hws1, \c_div_x_1, \mac_shift - lw \filter_w, 56(\args) - lw \filter_w_rs1_1, 148(\args) -.endm - - - - .text - .align 2 - .global dl_esp32p4_s8_depthwise_conv2d_hwc1_bias - .type dl_esp32p4_s8_depthwise_conv2d_hwc1_bias, @function - .balign 4 - .option norvc -dl_esp32p4_s8_depthwise_conv2d_hwc1_bias: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: input dilation x offset - # a5: input dilation y offset - # t3: next_hws1 - # t4: c_div_x_1 - # t5: mac_shift - # t6: bias_ptr - - # s0: filter_h / filter_n_offset - # s1: filter_w - # s8: filter_w_rs1_1 - # s9: filter_y_offset - # s10: - # s11: tmp_value - - addi sp, sp, -20 - sw s0, 16(sp) - sw s1, 12(sp) - sw s8, 8(sp) - sw s9, 4(sp) - sw s11, 0(sp) - - lw s9, 60(a2) // filter_y_offset - lw s0, 144(a2) // filter_n_offset - esp.movi.32.q q7, s0, 2 - - lw t6, 68(a2) // bias - esp32p4_s8_depthwise_conv2d_hwc1_load_args a2, a3, a4, a5, t3, t4, t5, s1, s8 - - esp32p4_s8_depthwise_conv2d_hwc1_bias_loop: - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3, s0, s1, s8, a2, q7, s9, s11 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp.vst.128.ip q0, a0, 16 - addi t4, t4, -1 - bgez t4, esp32p4_s8_depthwise_conv2d_hwc1_bias_loop - - lw s0, 16(sp) // restore s0 - lw s1, 12(sp) // restore s1 - lw s8, 8(sp) // restore s8 - lw s9, 4(sp) // restore s9 - lw s11, 0(sp) // restore s11 - addi sp, sp, 20 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_depthwise_conv2d_hwc1_bias_relu - .type dl_esp32p4_s8_depthwise_conv2d_hwc1_bias_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_depthwise_conv2d_hwc1_bias_relu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: input dilation x offset - # a5: input dilation y offset - # t3: next_hws1 - # t4: c_div_x_1 - # t5: mac_shift - # t6: bias_ptr - - # s0: filter_h / filter_n_offset - # s1: filter_w - # s8: filter_w_rs1_1 - # s9: filter_y_offset / activation_alpha - # s10: activation_shift - # s11: tmp_value - - addi sp, sp, -24 - sw s0, 20(sp) - sw s1, 16(sp) - sw s8, 12(sp) - sw s9, 8(sp) - sw s10, 4(sp) - sw s11, 0(sp) - - lw s9, 76(a2) // activation_alpha - lw s10, 84(a2) // activation_shift - esp.movi.32.q q7, s9, 3 - - lw s9, 60(a2) // filter_y_offset - lw s0, 144(a2) // filter_n_offset - esp.movi.32.q q7, s9, 1 - esp.movi.32.q q7, s0, 2 - - lw t6, 68(a2) // bias - esp32p4_s8_depthwise_conv2d_hwc1_load_args a2, a3, a4, a5, t3, t4, t5, s1, s8 - - esp32p4_s8_depthwise_conv2d_hwc1_bias_relu_loop: - esp.zero.qacc - esp.movi.32.a q7, s9, 1 // filter_y_offset - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3, s0, s1, s8, a2, q7, s9, s11 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp.movi.32.a q7, s9, 3 // activation_alpha - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp.vst.128.ip q0, a0, 16 - - addi t4, t4, -1 - bgez t4, esp32p4_s8_depthwise_conv2d_hwc1_bias_relu_loop - - lw s0, 20(sp) // restore s0 - lw s1, 16(sp) // restore s1 - lw s8, 12(sp) // restore s8 - lw s9, 8(sp) // restore s9 - lw s10, 4(sp) // restore s10 - lw s11, 0(sp) // restore s11 - addi sp, sp, 24 - ret - - - .text - .align 2 - .global dl_esp32p4_s8_depthwise_conv2d_hwc1_bias_prelu - .type dl_esp32p4_s8_depthwise_conv2d_hwc1_bias_prelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_depthwise_conv2d_hwc1_bias_prelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: input dilation x offset - # a5: input dilation y offset - # t3: next_hws1 - # t4: c_div_x_1 - # t5: mac_shift - # t6: bias_ptr - - # s0: filter_h / filter_n_offset - # s1: filter_w - # s8: filter_w_rs1_1 - # s9: filter_y_offset / activation_alpha - # s10: activation_shift - # s11: tmp_value - - addi sp, sp, -24 - sw s0, 20(sp) - sw s1, 16(sp) - sw s8, 12(sp) - sw s9, 8(sp) - sw s10, 4(sp) - sw s11, 0(sp) - - lw s9, 80(a2) // activation_alpha_ptr - lw s10, 84(a2) // activation_shift - esp.movi.32.q q7, s9, 3 - - lw s9, 60(a2) // filter_y_offset - lw s0, 144(a2) // filter_n_offset - esp.movi.32.q q7, s9, 1 - esp.movi.32.q q7, s0, 2 - - lw t6, 68(a2) // bias - esp32p4_s8_depthwise_conv2d_hwc1_load_args a2, a3, a4, a5, t3, t4, t5, s1, s8 - - esp32p4_s8_depthwise_conv2d_hwc1_bias_prelu_loop: - esp.zero.qacc - esp.movi.32.a q7, s9, 1 // filter_y_offset - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3, s0, s1, s8, a2, q7, s9, s11 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp.movi.32.a q7, s9, 3 // activation_alpha_ptr - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp.vst.128.ip q0, a0, 16 - - addi t4, t4, -1 - bgez t4, esp32p4_s8_depthwise_conv2d_hwc1_bias_prelu_loop - - lw s0, 20(sp) // restore s0 - lw s1, 16(sp) // restore s1 - lw s8, 12(sp) // restore s8 - lw s9, 8(sp) // restore s9 - lw s10, 4(sp) // restore s10 - lw s11, 0(sp) // restore s11 - addi sp, sp, 24 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_depthwise_conv2d_hwc1 - .type dl_esp32p4_s8_depthwise_conv2d_hwc1, @function - .balign 4 - .option norvc -dl_esp32p4_s8_depthwise_conv2d_hwc1: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: input dilation x offset - # a5: input dilation y offset - # t3: next_hws1 - # t4: c_div_x_1 - # t5: mac_shift - # t6: - - # s0: filter_h / filter_n_offset - # s1: filter_w - # s8: filter_w_rs1_1 - # s9: filter_y_offset - # s10: - # s11: tmp_value - - addi sp, sp, -20 - sw s0, 16(sp) - sw s1, 12(sp) - sw s8, 8(sp) - sw s9, 4(sp) - sw s11, 0(sp) - - lw s9, 60(a2) // filter_y_offset - lw s0, 144(a2) // filter_n_offset - esp.movi.32.q q7, s0, 2 - esp32p4_s8_depthwise_conv2d_hwc1_load_args a2, a3, a4, a5, t3, t4, t5, s1, s8 - - esp32p4_s8_depthwise_conv2d_hwc1_loop: - esp.zero.qacc - esp32p4_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3, s0, s1, s8, a2, q7, s9, s11 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp.vst.128.ip q0, a0, 16 - addi t4, t4, -1 - bgez t4, esp32p4_s8_depthwise_conv2d_hwc1_loop - - lw s0, 16(sp) // restore s0 - lw s1, 12(sp) // restore s1 - lw s8, 8(sp) // restore s8 - lw s9, 4(sp) // restore s9 - lw s11, 0(sp) // restore s11 - addi sp, sp, 20 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_depthwise_conv2d_hwc1_relu - .type dl_esp32p4_s8_depthwise_conv2d_hwc1_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_depthwise_conv2d_hwc1_relu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: input dilation x offset - # a5: input dilation y offset - # t3: next_hws1 - # t4: c_div_x_1 - # t5: mac_shift - # t6: - - # s0: filter_h / filter_n_offset - # s1: filter_w - # s8: filter_w_rs1_1 - # s9: filter_y_offset / activation_alpha - # s10: activation_shift - # s11: tmp_value - - addi sp, sp, -24 - sw s0, 20(sp) - sw s1, 16(sp) - sw s8, 12(sp) - sw s9, 8(sp) - sw s10, 4(sp) - sw s11, 0(sp) - - lw s9, 76(a2) // activation_alpha - lw s10, 84(a2) // activation_shift - esp.movi.32.q q7, s9, 3 - - lw s9, 60(a2) // filter_y_offset - lw s0, 144(a2) // filter_n_offset - esp.movi.32.q q7, s9, 1 - esp.movi.32.q q7, s0, 2 - esp32p4_s8_depthwise_conv2d_hwc1_load_args a2, a3, a4, a5, t3, t4, t5, s1, s8 - - esp32p4_s8_depthwise_conv2d_hwc1_relu_loop: - esp.zero.qacc - esp.movi.32.a q7, s9, 1 // filter_y_offset - esp32p4_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3, s0, s1, s8, a2, q7, s9, s11 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp.movi.32.a q7, s9, 3 // activation_alpha - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp.vst.128.ip q0, a0, 16 - - addi t4, t4, -1 - bgez t4, esp32p4_s8_depthwise_conv2d_hwc1_relu_loop - - lw s0, 20(sp) // restore s0 - lw s1, 16(sp) // restore s1 - lw s8, 12(sp) // restore s8 - lw s9, 8(sp) // restore s9 - lw s10, 4(sp) // restore s10 - lw s11, 0(sp) // restore s11 - addi sp, sp, 24 - ret - - - .text - .align 2 - .global dl_esp32p4_s8_depthwise_conv2d_hwc1_prelu - .type dl_esp32p4_s8_depthwise_conv2d_hwc1_prelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_depthwise_conv2d_hwc1_prelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: input dilation x offset - # a5: input dilation y offset - # t3: next_hws1 - # t4: c_div_x_1 - # t5: mac_shift - # t6: - - # s0: filter_h / filter_n_offset - # s1: filter_w - # s8: filter_w_rs1_1 - # s9: filter_y_offset / activation_alpha - # s10: activation_shift - # s11: tmp_value - - addi sp, sp, -24 - sw s0, 20(sp) - sw s1, 16(sp) - sw s8, 12(sp) - sw s9, 8(sp) - sw s10, 4(sp) - sw s11, 0(sp) - - lw s9, 80(a2) // activation_alpha_ptr - lw s10, 84(a2) // activation_shift - esp.movi.32.q q7, s9, 3 - - lw s9, 60(a2) // filter_y_offset - lw s0, 144(a2) // filter_n_offset - esp.movi.32.q q7, s9, 1 - esp.movi.32.q q7, s0, 2 - esp32p4_s8_depthwise_conv2d_hwc1_load_args a2, a3, a4, a5, t3, t4, t5, s1, s8 - - esp32p4_s8_depthwise_conv2d_hwc1_prelu_loop: - esp.zero.qacc - esp.movi.32.a q7, s9, 1 // filter_y_offset - esp32p4_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3, s0, s1, s8, a2, q7, s9, s11 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp.movi.32.a q7, s9, 3 // activation_alpha_ptr - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp.vst.128.ip q0, a0, 16 - - addi t4, t4, -1 - bgez t4, esp32p4_s8_depthwise_conv2d_hwc1_prelu_loop - - lw s0, 20(sp) // restore s0 - lw s1, 16(sp) // restore s1 - lw s8, 12(sp) // restore s8 - lw s9, 8(sp) // restore s9 - lw s10, 4(sp) // restore s10 - lw s11, 0(sp) // restore s11 - addi sp, sp, 24 - ret diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_mul2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_mul2d.S deleted file mode 100644 index 55a33f94..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_mul2d.S +++ /dev/null @@ -1,714 +0,0 @@ -#include "dl_esp32p4_s8.S" -#include "dl_esp32p4_common.S" - - -############################################################################################################################################################ -#### -#### esp32p4_s8_mul2d_11c series -#### -############################################################################################################################################################ - - .align 2 - .text - .global dl_esp32p4_s8_mul2d_11c - .type dl_esp32p4_s8_mul2d_11c, @function - #.section .iram1 -dl_esp32p4_s8_mul2d_11c: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: mul_shift - - lw a4, 64(a3) - lw a5, 100(a3) - lw t3, 76(a3) - - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - add t0, a4, x0 - blez t0, 1f - 0: - esp.zero.qacc - esp.vmulas.s8.qacc.ld.ip q0, a1, 16, q0, q1 - esp.vld.128.ip q1, a2, 16 - esp.srcmb.s8.qacc q2, a5, 1 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - esp.zero.qacc - esp.vmulas.s8.qacc q0, q1 - esp.srcmb.s8.qacc q2, a5, 1 - - esp.vst.128.ip q2, a0, 16 - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - .align 2 - .text - .global dl_esp32p4_s8_mul2d_11c_relu - .type dl_esp32p4_s8_mul2d_11c_relu, @function - #.section .iram1 -dl_esp32p4_s8_mul2d_11c_relu: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: mul_shift - # s8: activation_alpha - # s9: activation_shift - - - lw a4, 64(a3) - lw a5, 100(a3) - lw t3, 76(a3) - lw s8, 52(a3) - lw s9, 60(a3) - - - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - add t0, a4, x0 - blez t0, 1f - 0: - esp.zero.qacc - esp.vmulas.s8.qacc.ld.ip q0, a1, 16, q0, q1 - esp.vld.128.ip q1, a2, 16 - esp.srcmb.s8.qacc q2, a5, 1 - esp.vrelu.s8 q2, s8, s9 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - esp.zero.qacc - esp.vmulas.s8.qacc q0, q1 - esp.srcmb.s8.qacc q2, a5, 1 - - esp.vrelu.s8 q2, s8, s9 - esp.vst.128.ip q2, a0, 16 - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - .align 2 - .text - .global dl_esp32p4_s8_mul2d_11c_prelu - .type dl_esp32p4_s8_mul2d_11c_prelu, @function - #.section .iram1 -dl_esp32p4_s8_mul2d_11c_prelu: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: mul_shift - # s8: activation_alpha_ptr - # s9: activation_shift - - - lw a4, 64(a3) - lw a5, 100(a3) - lw s8, 56(a3) - lw s9, 60(a3) - - - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a2, 16 - add t0, a4, x0 - blez t0, 1f - 0: - esp.zero.qacc - esp.vmulas.s8.qacc.ld.ip q0, a1, 16, q0, q1 - esp.vld.128.ip q1, a2, 16 - - esp.vld.128.ip q3, s8, 16 - esp.srcmb.s8.qacc q2, a5, 1 - esp.vprelu.s8 q2, q2, q3, s9 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - esp.zero.qacc - esp.vmulas.s8.qacc q0, q1 - esp.vld.128.ip q3, s8, 16 - esp.srcmb.s8.qacc q2, a5, 1 - - esp.vprelu.s8 q2, q2, q3, s9 - esp.vst.128.ip q2, a0, 16 - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - - -############################################################################################################################################################ -#### -#### esp32p4_s8_unaligned_mul2d_11c series -#### -############################################################################################################################################################ - - .align 2 - .text - .global dl_esp32p4_s8_unaligned_mul2d_11c - .type dl_esp32p4_s8_unaligned_mul2d_11c, @function - #.section .iram1 -dl_esp32p4_s8_unaligned_mul2d_11c: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: c_remainder - # t3: mul_shift - - - lw a4, 64(a3) - lw a5, 76(a3) - lw t3, 100(a3) - - - - esp.ld.128.usar.ip q5, a0, 0 #get output_ptr sar_byte - esp.movx.r.sar.bytes s1 - - bltz a4, dl_tie718_s8_unaligned_mul2d_11c_small_remainder # channel < 16 - - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - beqz s1, dl_tie718_s8_unaligned_mul2d_11c_0 - li t0, 8 - beq s1, t0, dl_tie718_s8_unaligned_mul2d_11c_1 - - - add t0, a4, x0 - blez t0, 1f - 0: - esp.zero.qacc - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - esp.ld.128.usar.ip q1, a1, 16 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - addi a1, a1, -16 - add a1, a1, a5 - esp.zero.qacc - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, a5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - j dl_tie718_s8_unaligned_mul2d_11c_remainder - -dl_tie718_s8_unaligned_mul2d_11c_0: - - add t0, a4, x0 - blez t0, 3f - 2: - esp.zero.qacc - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - esp.ld.128.usar.ip q1, a1, 16 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - - addi a1, a1, -16 - add a1, a1, a5 - esp.zero.qacc - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, a5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - esp.vst.128.ip q2, a0, 16 - j dl_tie718_s8_unaligned_mul2d_11c_remainder - -dl_tie718_s8_unaligned_mul2d_11c_1: - - add t0, a4, x0 - blez t0, 5f - 4: - esp.zero.qacc - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - esp.ld.128.usar.ip q1, a1, 16 - esp32p4_s8_64b_unaligned_vector_store q2, a0 - addi t0, t0, -1 - bgtz t0, 4b - 5: - - addi a1, a1, -16 - add a1, a1, a5 - esp.zero.qacc - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, a5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - esp32p4_s8_64b_unaligned_vector_store q2, a0 - - j dl_tie718_s8_unaligned_mul2d_11c_remainder - -dl_tie718_s8_unaligned_mul2d_11c_small_remainder: - esp.ld.128.usar.xp q0, a1, a5 - esp.movx.r.sar.bytes t6 - - esp.ld.128.usar.xp q3, a2, a5 - esp.movx.r.sar.bytes s0 - -dl_tie718_s8_unaligned_mul2d_11c_remainder: - - - beqz a5, dl_esp32p4_s8_unaligned_mul2d_11c_end - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.zero.qacc - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - - # esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - dl_esp32p4_s8_store_remainder q2, t4, t6, s0, s1, t0, a0, a5 - -dl_esp32p4_s8_unaligned_mul2d_11c_end: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - - - .align 2 - .text - .global dl_esp32p4_s8_unaligned_mul2d_11c_relu - .type dl_esp32p4_s8_unaligned_mul2d_11c_relu, @function - #.section .iram1 -dl_esp32p4_s8_unaligned_mul2d_11c_relu: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: c_remainder - # t3: mul_shift - # s8: activation_alpha - # s9: activation_shift - - - lw a4, 64(a3) - lw a5, 76(a3) - lw t3, 100(a3) - lw s8, 52(a3) - lw s9, 60(a3) - - - - esp.ld.128.usar.ip q5, a0, 0 #get output_ptr sar_byte - esp.movx.r.sar.bytes s1 - - bltz a4, dl_tie718_s8_unaligned_mul2d_11c_relu_small_remainder # channel < 16 - - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - beqz s1, dl_tie718_s8_unaligned_mul2d_11c_relu_0 - li t0, 8 - beq s1, t0, dl_tie718_s8_unaligned_mul2d_11c_relu_1 - - - add t0, a4, x0 - blez t0, 1f - 0: - esp.zero.qacc - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - esp.ld.128.usar.ip q1, a1, 16 - esp.vrelu.s8 q2, s8, s9 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - addi t0, t0, -1 - bgtz t0, 0b - 1: - addi a1, a1, -16 - add a1, a1, a5 - esp.zero.qacc - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, a5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - esp.vrelu.s8 q2, s8, s9 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - j dl_tie718_s8_unaligned_mul2d_11c_relu_remainder - -dl_tie718_s8_unaligned_mul2d_11c_relu_0: - - add t0, a4, x0 - blez t0, 3f - 2: - esp.zero.qacc - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - esp.ld.128.usar.ip q1, a1, 16 - esp.vrelu.s8 q2, s8, s9 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - addi a1, a1, -16 - add a1, a1, a5 - esp.zero.qacc - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, a5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - esp.vrelu.s8 q2, s8, s9 - esp.vst.128.ip q2, a0, 16 - j dl_tie718_s8_unaligned_mul2d_11c_relu_remainder - -dl_tie718_s8_unaligned_mul2d_11c_relu_1: - - add t0, a4, x0 - blez t0, 5f - 4: - esp.zero.qacc - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - esp.ld.128.usar.ip q1, a1, 16 - esp.vrelu.s8 q2, s8, s9 - esp32p4_s8_64b_unaligned_vector_store q2, a0 - addi t0, t0, -1 - bgtz t0, 4b - 5: - addi a1, a1, -16 - add a1, a1, a5 - esp.zero.qacc - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, a5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - esp.vrelu.s8 q2, s8, s9 - esp32p4_s8_64b_unaligned_vector_store q2, a0 - j dl_tie718_s8_unaligned_mul2d_11c_relu_remainder - -dl_tie718_s8_unaligned_mul2d_11c_relu_small_remainder: - esp.ld.128.usar.xp q0, a1, a5 - esp.movx.r.sar.bytes t6 - - esp.ld.128.usar.xp q3, a2, a5 - esp.movx.r.sar.bytes s0 - -dl_tie718_s8_unaligned_mul2d_11c_relu_remainder: - - - beqz a5, dl_esp32p4_s8_unaligned_mul2d_11c_relu_end - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.zero.qacc - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - esp.vrelu.s8 q2, s8, s9 - # esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - dl_esp32p4_s8_store_remainder q2, t4, t6, s0, s1, t0, a0, a5 - -dl_esp32p4_s8_unaligned_mul2d_11c_relu_end: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - - - - .align 2 - .text - .global dl_esp32p4_s8_unaligned_mul2d_11c_prelu - .type dl_esp32p4_s8_unaligned_mul2d_11c_prelu, @function - #.section .iram1 -dl_esp32p4_s8_unaligned_mul2d_11c_prelu: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - - # a0: int8_t *output_ptr - # a1: int8_t *input0_ptr - # a2: int8_t *input1_ptr - # a3: void *args - # a4: c_div_x_1 - # a5: c_remainder - # t3: mul_shift - # s8: activation_alpha_ptr - # s9: activation_shift - - - lw a4, 64(a3) - lw a5, 76(a3) - lw t3, 100(a3) - lw s8, 56(a3) - lw s9, 60(a3) - - - - esp.ld.128.usar.ip q5, a0, 0 #get output_ptr sar_byte - esp.movx.r.sar.bytes s1 - - bltz a4, dl_tie718_s8_unaligned_mul2d_11c_prelu_small_remainder # channel < 16 - - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q3, a2, 16 - esp.ld.128.usar.ip q1, a1, 16 - - beqz s1, dl_tie718_s8_unaligned_mul2d_11c_prelu_0 - li t0, 8 - beq s1, t0, dl_tie718_s8_unaligned_mul2d_11c_prelu_1 - - - add t0, a4, x0 - blez t0, 1f - 0: - esp.zero.qacc - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - - esp.vld.128.ip q6, s8, 16 - esp.ld.128.usar.ip q1, a1, 16 - esp.vprelu.s8 q2, q2, q6, s9 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - addi t0, t0, -1 - bgtz t0, 0b - 1: - addi a1, a1, -16 - add a1, a1, a5 - esp.zero.qacc - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, a5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.vld.128.ip q6, s8, 16 - esp.srcmb.s8.qacc q2, t3, 1 - esp.vprelu.s8 q2, q2, q6, s9 - esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - j dl_tie718_s8_unaligned_mul2d_11c_prelu_remainder - -dl_tie718_s8_unaligned_mul2d_11c_prelu_0: - - add t0, a4, x0 - blez t0, 3f - 2: - esp.zero.qacc - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - esp.vld.128.ip q6, s8, 16 - esp.ld.128.usar.ip q1, a1, 16 - esp.vprelu.s8 q2, q2, q6, s9 - esp.vst.128.ip q2, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b - 3: - addi a1, a1, -16 - add a1, a1, a5 - esp.zero.qacc - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, a5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.vld.128.ip q6, s8, 16 - esp.srcmb.s8.qacc q2, t3, 1 - esp.vprelu.s8 q2, q2, q6, s9 - esp.vst.128.ip q2, a0, 16 - j dl_tie718_s8_unaligned_mul2d_11c_prelu_remainder - -dl_tie718_s8_unaligned_mul2d_11c_prelu_1: - - add t0, a4, x0 - blez t0, 5f - 4: - esp.zero.qacc - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 16 - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.srcmb.s8.qacc q2, t3, 1 - esp.vld.128.ip q6, s8, 16 - esp.ld.128.usar.ip q1, a1, 16 - esp.vprelu.s8 q2, q2, q6, s9 - esp32p4_s8_64b_unaligned_vector_store q2, a0 - addi t0, t0, -1 - bgtz t0, 4b - 5: - addi a1, a1, -16 - add a1, a1, a5 - esp.zero.qacc - esp.movx.r.sar.bytes t6 #input0 sar - esp.src.q.qup q2, q0, q1 - - esp.ld.128.usar.xp q4, a2, a5 - esp.movx.r.sar.bytes s0 #input1 sar - esp.src.q.qup q5, q3, q4 - - esp.vmulas.s8.qacc q2, q5 - esp.vld.128.ip q6, s8, 16 - esp.srcmb.s8.qacc q2, t3, 1 - esp.vprelu.s8 q2, q2, q6, s9 - esp32p4_s8_64b_unaligned_vector_store q2, a0 - j dl_tie718_s8_unaligned_mul2d_11c_prelu_remainder - - -dl_tie718_s8_unaligned_mul2d_11c_prelu_small_remainder: - esp.ld.128.usar.xp q0, a1, a5 - esp.movx.r.sar.bytes t6 - - esp.ld.128.usar.xp q3, a2, a5 - esp.movx.r.sar.bytes s0 - -dl_tie718_s8_unaligned_mul2d_11c_prelu_remainder: - - beqz a5, dl_esp32p4_s8_unaligned_mul2d_11c_prelu_end - - esp.ld.128.usar.ip q1, a1, 0 - esp.movx.w.sar.bytes t6 - esp.src.q q2, q0, q1 - - esp.ld.128.usar.ip q4, a2, 0 - esp.movx.w.sar.bytes s0 - esp.src.q q5, q3, q4 - - esp.zero.qacc - esp.vmulas.s8.qacc q2, q5 - esp.vld.128.ip q6, s8, 16 - esp.srcmb.s8.qacc q2, t3, 1 - esp.vprelu.s8 q2, q2, q6, s9 - # esp32p4_s8_32b_unaligned_vector_store q2, a0, s1 - dl_esp32p4_s8_store_remainder q2, t4, t6, s0, s1, t0, a0, a5 - -dl_esp32p4_s8_unaligned_mul2d_11c_prelu_end: - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_resize2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_resize2d.S deleted file mode 100644 index c90f29b8..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_resize2d.S +++ /dev/null @@ -1,128 +0,0 @@ -#include "dl_esp32p4_s8.S" -#include "dl_esp32p4_common.S" - - .align 2 - .text - .global dl_esp32p4_s8_resize2d_nearest_2x2_c1 - .type dl_esp32p4_s8_resize2d_nearest_2x2_c1, @function -dl_esp32p4_s8_resize2d_nearest_2x2_c1: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - # a3: output_x_offset - # a4: output_y_offset - # a5: c_div_x - # t3: remainder - # t4: output_shift - # t5: output_scale - - lw a3, 20(a2) - lw a4, 24(a2) - lw a5, 40(a2) - lw t3, 44(a2) - lw t4, 48(a2) - lw t5, 52(a2) - - # t6 (0, 1) - # s0 (1, 0) - # s1 (1, 1) - add t6, a0, a3 - add s0, a0, a4 - add s1, t6, a4 - - sb t5, 0(sp) - add s8, sp, x0 - esp.vldbc.8.ip q1, s8, 0 # all output_scale - esp.vld.128.ip q0, a1, 16 - - add t0, a5, x0 - blez t0, 1f - 0: - esp.zero.qacc - esp.vmulas.s8.qacc.ld.ip q0, a1, 16, q0, q1 - esp.srcmb.s8.qacc q2, t4, 1 - esp.vst.128.ip q2, a0, 16 - esp.vst.128.ip q2, t6, 16 - esp.vst.128.ip q2, s0, 16 - esp.vst.128.ip q2, s1, 16 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - .align 2 - .text - .global dl_esp32p4_s8_unaligned_resize2d_nearest_2x2_c1 - .type dl_esp32p4_s8_unaligned_resize2d_nearest_2x2_c1, @function -dl_esp32p4_s8_unaligned_resize2d_nearest_2x2_c1: - .align 2 - esp32p4_push_128_stacks_4r s0, s1, s8, s9 - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - # a3: output_x_offset - # a4: output_y_offset - # a5: c_div_x - # t3: remainder - # t4: output_shift - # t5: output_scale - - - lw a3, 20(a2) - lw a4, 24(a2) - lw a5, 40(a2) - lw t3, 44(a2) - lw t4, 48(a2) - lw t5, 52(a2) - - - # t6 (0, 1) - # s0 (1, 0) - # s1 (1, 1) - add t6, a0, a3 - add s0, a0, a4 - add s1, t6, a4 - - sb t5, 0(sp) - add s8, sp, x0 - esp.vldbc.8.ip q3, s8, 0 # all output_scale - esp.ld.128.usar.ip q0, a1, 16 - - add t0, a5, x0 - blez t0, 1f - 0: - esp.zero.qacc - esp.ld.128.usar.ip q1, a1, 16 - esp.src.q.qup q2, q0, q1 - esp.vmulas.s8.qacc q2, q3 - esp.srcmb.s8.qacc q4, t4, 1 - esp32p4_s8_32b_unaligned_vector_store q4, a0, s8 - esp32p4_s8_32b_unaligned_vector_store q4, t6, s8 - esp32p4_s8_32b_unaligned_vector_store q4, s0, s8 - esp32p4_s8_32b_unaligned_vector_store q4, s1, s8 - addi t0, t0, -1 - bgtz t0, 0b - 1: - - bnez t3, dl_esp32p4_s8_unaligned_resize2d_nearest_2x2_c1_remainder - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - -dl_esp32p4_s8_unaligned_resize2d_nearest_2x2_c1_remainder: - esp.zero.qacc - esp.ld.128.usar.ip q1, a1, 16 - esp.src.q.qup q2, q0, q1 - esp.vmulas.s8.qacc q2, q3 - esp.srcmb.s8.qacc q4, t4, 1 - dl_esp32p4_s8_store_remainder q4, t4, t5, s8, s9, t0, a0, t3 - dl_esp32p4_s8_store_remainder q4, t4, t5, s8, s9, t0, t6, t3 - dl_esp32p4_s8_store_remainder q4, t4, t5, s8, s9, t0, s0, t3 - dl_esp32p4_s8_store_remainder q4, t4, t5, s8, s9, t0, s1, t3 - esp32p4_pop_128_stacks_4r s0, s1, s8, s9 - ret - - \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_unaligned_conv2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_unaligned_conv2d.S deleted file mode 100644 index 8d0e6b9c..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_unaligned_conv2d.S +++ /dev/null @@ -1,2636 +0,0 @@ -#include "dl_esp32p4_s8.S" -#include "dl_esp32p4_common.S" - - - -############################################################################################################################################################ -#### -#### esp32p4_s8_unaligned_conv2d_11cn series -#### -############################################################################################################################################################ -.macro esp32p4_s8_unaligned_conv2d_11c16 input_v0, input_front_aligned, input_back_aligned, input_ptr, filter_v0, filter_v1, filter_ptr, c_div_x_1, remainder_c, tmp - # scalar * vecter and accumulate into qacc - # input_ptr += (c_div_x_1 + 1) * 16 in the end - # filter_ptr point to the next 16 bytes in the end - - # input_v0: 16 input elements - # filter_v0: 16 filter elements - # filter_v1: 16 filter elements - # input_ptr: input_ptr - # filter_ptr: filter_ptr - # c_div_x_1: input_channel // 16 - 1 - - bltz \c_div_x_1, 17f # input_channel < 16 - - esp.ld.128.usar.ip \input_front_aligned, \input_ptr, 16 - - - esp.vld.128.ip \filter_v0, \filter_ptr, 16 - esp.vld.128.ip \filter_v1, \filter_ptr, 16 - esp.ld.128.usar.ip \input_back_aligned, \input_ptr, 16 - - beqz \c_div_x_1, 19f - # lp.setup 0, \c_div_x_1, 18f - esp.lp.setup 0, \c_div_x_1, 18f - esp.src.q.qup \input_v0, \input_front_aligned, \input_back_aligned - esp.ld.128.usar.ip \input_back_aligned, \input_ptr, 16 - - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 9 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 10 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 11 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 12 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 13 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 14 - 18: esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 15 - - 19: - esp.src.q.qup \input_v0, \input_front_aligned, \input_back_aligned - addi \input_ptr, \input_ptr, -16 - add \input_ptr, \input_ptr, \remainder_c #input_ptr and the end of c - - - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 9 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 10 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 11 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 12 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 13 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 14 - esp.vsmulas.s8.qacc \filter_v1, \input_v0, 15 - - - beqz \remainder_c, 16f #no c_remainder - - esp.vld.128.ip \filter_v0, \filter_ptr, 16 - - esp.vld.128.ip \input_back_aligned, \input_ptr, 0 - esp.src.q \input_v0, \input_front_aligned, \input_back_aligned - j 15f - - -17: # input_channel < 16 - esp.vld.128.ip \filter_v0, \filter_ptr, 16 - - esp.ld.128.usar.xp \input_front_aligned, \input_ptr, \remainder_c - esp.vld.128.ip \input_back_aligned, \input_ptr, 0 - esp.src.q \input_v0, \input_front_aligned, \input_back_aligned - - -15: # remainder_c == 15, 0x1111 - andi \tmp, \remainder_c, 8 - beqz \tmp, 7f - - esp.vld.128.ip \filter_v1, \filter_ptr, 16 - - andi \tmp, \remainder_c, 4 - beqz \tmp, 11f - andi \tmp, \remainder_c, 2 - beqz \tmp, 13f - andi \tmp, \remainder_c, 1 - beqz \tmp, 14f - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 9 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 10 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 11 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 12 - esp.vsmulas.s8.qacc \filter_v1, \input_v0, 13 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 14 - j 16f # jump to 16f - -14: # remainder_c == 14, 0x1110 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 9 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 10 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 11 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 12 - esp.vsmulas.s8.qacc \filter_v1, \input_v0, 13 - j 16f # jump to 16f - -13: # remainder_c == 13, 0x1101 - andi \tmp, \remainder_c, 1 - beqz \tmp, 12f - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 9 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 10 - esp.vsmulas.s8.qacc \filter_v1, \input_v0, 11 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 12 - j 16f # jump to 16f - -12: # remainder_c == 12, 0x1100 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 9 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 10 - esp.vsmulas.s8.qacc \filter_v1, \input_v0, 11 - j 16f # jump to 16f - -11: # remainder_c == 11, 0x1011 - andi \tmp, \remainder_c, 2 - beqz \tmp, 9f - andi \tmp, \remainder_c, 1 - beqz \tmp, 10f - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - esp.vsmulas.s8.qacc \filter_v1, \input_v0, 9 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 10 - j 16f # jump to 16f -10: # remainder_c == 10, 0x1010 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 8 - esp.vsmulas.s8.qacc \filter_v1, \input_v0, 9 - j 16f # jump to 16f -9: # remainder_c == 9, 0x1001 - andi \tmp, \remainder_c, 1 - beqz \tmp, 8f - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - esp.vsmulas.s8.qacc \filter_v1, \input_v0, 7 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 8 - j 16f # jump to 16f -8: # remainder_c == 8, 0x1000 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 6 - esp.vsmulas.s8.qacc \filter_v1, \input_v0, 7 - j 16f # jump to 16f - -7: # remainder == 7, 0x111 - andi \tmp, \remainder_c, 4 - beqz \tmp, 3f - - esp.vld.128.ip \filter_v1, \filter_ptr, 16 - andi \tmp, \remainder_c, 2 - beqz \tmp, 5f - andi \tmp, \remainder_c, 1 - beqz \tmp, 6f - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - esp.vsmulas.s8.qacc \filter_v1, \input_v0, 5 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 6 - j 16f # jump to 16f - -6: # remainder == 6, 0x110 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 4 - esp.vsmulas.s8.qacc \filter_v1, \input_v0, 5 - j 16f # jump to 16f - -5: # remainder == 4, 5 - andi \tmp, \remainder_c, 1 - beqz \tmp, 4f - # remainder == 5, 0x101 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - esp.vsmulas.s8.qacc \filter_v1, \input_v0, 3 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 4 - j 16f # jump to 16f - -4: # remainder == 4, 0x100 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc.ld.incp \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 2 - esp.vsmulas.s8.qacc \filter_v1, \input_v0, 3 - j 16f # jump to 16f - -3: # remainder == 1, 2, 3 - andi \tmp, \remainder_c, 2 - beqz \tmp, 1f - - esp.vld.128.ip \filter_v1, \filter_ptr, 16 - - andi \tmp, \remainder_c, 1 - beqz \tmp, 2f - # remainder == 3, 0x011 - esp.vsmulas.s8.qacc.ld.incp \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc \filter_v1, \input_v0, 1 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 2 - j 16f # jump to 16f - -2: # remainder == 2, 0x010 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 0 - esp.vsmulas.s8.qacc \filter_v1, \input_v0, 1 - j 16f # jump to 16f - -1: # remainder == 1, 0x001 - esp.vsmulas.s8.qacc \filter_v0, \input_v0, 0 - -16: -.endm - - - -.macro esp32p4_s8_unaligned_conv2d_11c1 input_v, input_front, input_back, filter_v, filter_front, filter_back, input_ptr, filter_ptr, c_div_x_1, c_remainder, c_remainder_shift, zero - # input_v: 16 input elements - # filter_v: 16 filter elements - # input_ptr: input_ptr - # filter_ptr: filter_ptr - # c_div_x_1: input_channel // 16 - 1 - # c_remainder: input_channel % 16 - # c_remainder_shift: 15 - c_remainder - - esp.ld.128.usar.ip \input_front, \input_ptr, 16 - esp.ld.128.usar.ip \filter_front, \filter_ptr, 16 - - bltz \c_div_x_1, 11f // input_channel < 16 - esp.ld.128.usar.ip \input_back, \input_ptr, 16 - - beqz \c_div_x_1, 10f - # esp.lp.setup 0, \c_div_x_1, 9f - # Use the zero register as a loop counter, and the value remains zero after the loop is complete. - mv \zero, \c_div_x_1 - 9: - esp.src.q.qup \input_v, \input_front, \input_back - - esp.ld.128.usar.ip \filter_back, \filter_ptr, 16 - esp.src.q.qup \filter_v, \filter_front, \filter_back - - esp.ld.128.usar.ip \input_back, \input_ptr, 16 - esp.vmulas.s8.xacc \filter_v, \input_v - addi \zero, \zero, -1 - bgtz \zero, 9b - - 10: - // last entire-128b - esp.src.q.qup \input_v, \input_front, \input_back - - esp.ld.128.usar.ip \filter_back, \filter_ptr, 16 - esp.src.q.qup \filter_v, \filter_front, \filter_back - - esp.vmulas.s8.xacc \filter_v, \input_v - - beqz \c_remainder, 12f - - 11: - // c_remainder > 0 - esp.ld.128.usar.xp \input_back, \input_ptr, \c_remainder - esp.src.q.qup \input_v, \input_front, \input_back - - esp.ld.128.usar.xp \filter_back, \filter_ptr, \c_remainder - esp.src.q.qup \filter_v, \filter_front, \filter_back - - esp.slcxxp.2q \input_back, \input_v, \c_remainder_shift, \zero - esp.slcxxp.2q \filter_back, \filter_v, \c_remainder_shift, \zero - - esp.vmulas.s8.xacc \filter_v, \input_v - - 12: - addi \input_ptr, \input_ptr, -16 - addi \filter_ptr, \filter_ptr, -16 -.endm - - - -.macro esp32p4_s8_unaligned_conv2d_11cn_load_args args, filter_ptr, c_div_x_1, n_div_x, mac_shift, c_remainder - lw \filter_ptr, 48(\args) // filter - lw \c_div_x_1, 100(\args) // input_channel / x - 1 - lw \n_div_x, 96(\args) // output_channel / x - lw \mac_shift, 64(\args) // mac_shift - lw \c_remainder, 136(\args) // input_channel % (vector_width / element_width) * sizeof(feature_t) -.endm - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_11cn_bias - .type dl_esp32p4_s8_unaligned_conv2d_11cn_bias, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_11cn_bias: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr / n_remainder bias tmp value - # t4: c_remainder_shift - # t5: zero - # t6: bias_ptr - - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): temp value - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: - # s10: - # s11: - - esp32p4_push_12_stacks_3r s0, s1, s8 // push stacks - li s1, -128 // clamp min value - li s8, 127 // clamp max value - esp32p4_s8_unaligned_conv2d_11cn_load_args a2, a3, t0, t1, a5, a4 - lw t6, 68(a2) // bias - - beqz t1, esp32p4_s8_unaligned_conv2d_11cn_bias_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - esp.movx.r.sar.bytes t3 // t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_11cn_bias_128b - li t2, 8 - beq t3, t2, esp32p4_s8_unaligned_conv2d_11cn_bias_64b - esp32p4_s8_unaligned_conv2d_11cn_bias_32b_multiple_loop: // esp32p4_s8_unaligned_conv2d_11cn_bias_32b - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_bias_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_11cn_bias_n_remainder - - esp32p4_s8_unaligned_conv2d_11cn_bias_64b: - esp32p4_s8_unaligned_conv2d_11cn_bias_64b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_bias_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_11cn_bias_n_remainder - - esp32p4_s8_unaligned_conv2d_11cn_bias_128b: - esp32p4_s8_unaligned_conv2d_11cn_bias_128b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_bias_128b_multiple_loop - - - esp32p4_s8_unaligned_conv2d_11cn_bias_n_remainder: - lw t1, 140(a2) // t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_11cn_bias_n_remainder_end - li t4, 15 - sub t4, t4, a4 // t4: 15 - c_remainder - li t5, 0 // t5: activation_shift = zero - - esp32p4_s8_unaligned_conv2d_11cn_bias_n_remainder_loop: - // mv t3, a1 // t3: input_ptr - esp.zero.xacc - - esp32p4_s8_conv2d_element_bias t6, t3 // t3: tmp for bias - mv t3, a1 // t3: input_ptr - esp32p4_s8_unaligned_conv2d_11c1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t4, t5 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_bias_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_11cn_bias_n_remainder_end: - - esp32p4_pop_12_stacks_3r s0, s1, s8 // restore registers - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu - .type dl_esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr / n_remainder bias tmp value - # t4: c_remainder_shift - # t5: zero - # t6: bias_ptr - - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): temp value - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: activation_alpha/_address - # s10: activation_shift - # s11: - - esp32p4_push_20_stacks_5r s0, s1, s8, s9, s10 - - li s1, -128 // clamp min value - li s8, 127 // clamp max value - lw s9, 76(a2) // activation_alpha - lw s10, 84(a2) // activation_shift - esp32p4_s8_unaligned_conv2d_11cn_load_args a2, a3, t0, t1, a5, a4 - lw t6, 68(a2) // bias - - beqz t1, esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - esp.movx.r.sar.bytes t3 // t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_128b - li t2, 8 - beq t3, t2, esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_64b - esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_32b_multiple_loop: // esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_32b - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_n_remainder - - esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_64b: - esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_64b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_n_remainder - - esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_128b: - esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_128b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_128b_multiple_loop - - - esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_n_remainder: - lw t1, 140(a2) // t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_n_remainder_end - li t4, 15 - sub t4, t4, a4 // t4: 15 - c_remainder - li t5, 0 // t5: activation_shift = zero - - esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_n_remainder_loop: - // mv t3, a1 // t3: input_ptr - esp.zero.xacc - - esp32p4_s8_conv2d_element_bias t6, t3 // t3: tmp for bias - mv t3, a1 // t3: input_ptr - esp32p4_s8_unaligned_conv2d_11c1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t4, t5 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_leakyrelu s0, s9, s10 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_11cn_bias_leakyrelu_n_remainder_end: - - esp32p4_pop_20_stacks_5r s0, s1, s8, s9, s10 - ret - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_11cn_bias_prelu - .type dl_esp32p4_s8_unaligned_conv2d_11cn_bias_prelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_11cn_bias_prelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr / n_remainder bias tmp value - # t4: c_remainder_shift - # t5: zero - # t6: bias_ptr - - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): temp value - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: activation_alpha/_address - # s10: activation_shift - # s11: - - esp32p4_push_20_stacks_5r s0, s1, s8, s9, s10 - - li s1, -128 // clamp min value - li s8, 127 // clamp max value - lw s9, 80(a2) // activation_alpha_ptr - lw s10, 84(a2) // activation_shift - esp32p4_s8_unaligned_conv2d_11cn_load_args a2, a3, t0, t1, a5, a4 - lw t6, 68(a2) // bias - - beqz t1, esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - esp.movx.r.sar.bytes t3 // t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_128b - li t2, 8 - beq t3, t2, esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_64b - esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_32b_multiple_loop: // esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_32b - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_n_remainder - - esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_64b: - esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_64b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_n_remainder - - esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_128b: - esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_128b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_128b_multiple_loop - - - esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_n_remainder: - lw t1, 140(a2) // t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_n_remainder_end - li t4, 15 - sub t4, t4, a4 // t4: 15 - c_remainder - li t5, 0 // t5: activation_shift = zero - - esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_n_remainder_loop: - // mv t3, a1 // t3: input_ptr - esp.zero.xacc - - esp32p4_s8_conv2d_element_bias t6, t3 // t3: tmp for bias - mv t3, a1 // t3: input_ptr - esp32p4_s8_unaligned_conv2d_11c1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t4, t5 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_prelu s0, s9, s10 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_11cn_bias_prelu_n_remainder_end: - - esp32p4_pop_20_stacks_5r s0, s1, s8, s9, s10 - ret - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_11cn - .type dl_esp32p4_s8_unaligned_conv2d_11cn, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_11cn: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr - # t4: c_remainder_shift - # t5: zero - # t6: - - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): temp value - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: - # s10: - # s11: - - esp32p4_push_12_stacks_3r s0, s1, s8 // push stacks - li s1, -128 // clamp min value - li s8, 127 // clamp max value - esp32p4_s8_unaligned_conv2d_11cn_load_args a2, a3, t0, t1, a5, a4 - - beqz t1, esp32p4_s8_unaligned_conv2d_11cn_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - esp.movx.r.sar.bytes t3 // t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_11cn_128b - li t2, 8 - beq t3, t2, esp32p4_s8_unaligned_conv2d_11cn_64b - esp32p4_s8_unaligned_conv2d_11cn_32b_multiple_loop: // esp32p4_s8_unaligned_conv2d_11cn_32b - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_11cn_n_remainder - - esp32p4_s8_unaligned_conv2d_11cn_64b: - esp32p4_s8_unaligned_conv2d_11cn_64b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_11cn_n_remainder - - esp32p4_s8_unaligned_conv2d_11cn_128b: - esp32p4_s8_unaligned_conv2d_11cn_128b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_128b_multiple_loop - - - esp32p4_s8_unaligned_conv2d_11cn_n_remainder: - lw t1, 140(a2) // t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_11cn_n_remainder_end - li t4, 15 - sub t4, t4, a4 // t4: 15 - c_remainder - li t5, 0 // t5: activation_shift = zero - - esp32p4_s8_unaligned_conv2d_11cn_n_remainder_loop: - mv t3, a1 // t3: input_ptr - esp.zero.xacc - - esp32p4_s8_unaligned_conv2d_11c1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t4, t5 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_11cn_n_remainder_end: - - esp32p4_pop_12_stacks_3r s0, s1, s8 // restore registers - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_11cn_leakyrelu - .type dl_esp32p4_s8_unaligned_conv2d_11cn_leakyrelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_11cn_leakyrelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr - # t4: c_remainder_shift - # t5: zero - # t6: - - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): temp value - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: activation_alpha/_address - # s10: activation_shift - # s11: - - esp32p4_push_20_stacks_5r s0, s1, s8, s9, s10 - - li s1, -128 // clamp min value - li s8, 127 // clamp max value - lw s9, 76(a2) // activation_alpha - lw s10, 84(a2) // activation_shift - esp32p4_s8_unaligned_conv2d_11cn_load_args a2, a3, t0, t1, a5, a4 - - beqz t1, esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - esp.movx.r.sar.bytes t3 // t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_128b - li t2, 8 - beq t3, t2, esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_64b - esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_32b_multiple_loop: // esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_32b - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_n_remainder - - esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_64b: - esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_64b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_n_remainder - - esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_128b: - esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_128b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_128b_multiple_loop - - - esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_n_remainder: - lw t1, 140(a2) // t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_n_remainder_end - li t4, 15 - sub t4, t4, a4 // t4: 15 - c_remainder - li t5, 0 // t5: activation_shift = zero - - esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_n_remainder_loop: - mv t3, a1 // t3: input_ptr - esp.zero.xacc - - esp32p4_s8_unaligned_conv2d_11c1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t4, t5 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_leakyrelu s0, s9, s10 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_11cn_leakyrelu_n_remainder_end: - - esp32p4_pop_20_stacks_5r s0, s1, s8, s9, s10 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_11cn_prelu - .type dl_esp32p4_s8_unaligned_conv2d_11cn_prelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_11cn_prelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr - # t4: c_remainder_shift - # t5: zero - # t6: - - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): temp value - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: activation_alpha/_address - # s10: activation_shift - # s11: - - esp32p4_push_20_stacks_5r s0, s1, s8, s9, s10 - - li s1, -128 // clamp min value - li s8, 127 // clamp max value - lw s9, 80(a2) // activation_alpha_ptr - lw s10, 84(a2) // activation_shift - esp32p4_s8_unaligned_conv2d_11cn_load_args a2, a3, t0, t1, a5, a4 - - beqz t1, esp32p4_s8_unaligned_conv2d_11cn_prelu_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - esp.movx.r.sar.bytes t3 // t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_11cn_prelu_128b - li t2, 8 - beq t3, t2, esp32p4_s8_unaligned_conv2d_11cn_prelu_64b - esp32p4_s8_unaligned_conv2d_11cn_prelu_32b_multiple_loop: // esp32p4_s8_unaligned_conv2d_11cn_prelu_32b - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_prelu_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_11cn_prelu_n_remainder - - esp32p4_s8_unaligned_conv2d_11cn_prelu_64b: - esp32p4_s8_unaligned_conv2d_11cn_prelu_64b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_prelu_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_11cn_prelu_n_remainder - - esp32p4_s8_unaligned_conv2d_11cn_prelu_128b: - esp32p4_s8_unaligned_conv2d_11cn_prelu_128b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_11c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_prelu_128b_multiple_loop - - - esp32p4_s8_unaligned_conv2d_11cn_prelu_n_remainder: - lw t1, 140(a2) // t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_11cn_prelu_n_remainder_end - li t4, 15 - sub t4, t4, a4 // t4: 15 - c_remainder - li t5, 0 // t5: activation_shift = zero - - esp32p4_s8_unaligned_conv2d_11cn_prelu_n_remainder_loop: - mv t3, a1 // t3: input_ptr - esp.zero.xacc - - esp32p4_s8_unaligned_conv2d_11c1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t4, t5 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_prelu s0, s9, s10 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_11cn_prelu_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_11cn_prelu_n_remainder_end: - - esp32p4_pop_20_stacks_5r s0, s1, s8, s9, s10 - ret - - - - -############################################################################################################################################################ -#### -#### esp32p4_s8_unaligned_conv2d_33cn series -#### -############################################################################################################################################################ -.macro esp32p4_s8_unaligned_conv2d_33c16 input_v0, input_front, input_back, input_ptr, filter_v0, filter_v1, filter_ptr, c_div_x_1, c_remainder, dilation_x_offset, dilation_y_offset, temp - esp32p4_s8_unaligned_conv2d_11c16 \input_v0, \input_front, \input_back, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \c_remainder, \temp - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_unaligned_conv2d_11c16 \input_v0, \input_front, \input_back, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \c_remainder, \temp - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_unaligned_conv2d_11c16 \input_v0, \input_front, \input_back, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \c_remainder, \temp - add \input_ptr, \input_ptr, \dilation_y_offset - - esp32p4_s8_unaligned_conv2d_11c16 \input_v0, \input_front, \input_back, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \c_remainder, \temp - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_unaligned_conv2d_11c16 \input_v0, \input_front, \input_back, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \c_remainder, \temp - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_unaligned_conv2d_11c16 \input_v0, \input_front, \input_back, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \c_remainder, \temp - add \input_ptr, \input_ptr, \dilation_y_offset - - esp32p4_s8_unaligned_conv2d_11c16 \input_v0, \input_front, \input_back, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \c_remainder, \temp - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_unaligned_conv2d_11c16 \input_v0, \input_front, \input_back, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \c_remainder, \temp - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_unaligned_conv2d_11c16 \input_v0, \input_front, \input_back, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \c_remainder, \temp - # add \input_ptr, \input_ptr, \dilation_y_offset -.endm - - - -.macro esp32p4_s8_unaligned_conv2d_33c1 input_v0, input_front, input_back, filter_v0, filter_front, filter_back, input_ptr, filter_ptr, c_div_x_1, c_remainder, dilation_x_offset, dilation_y_offset, c_remainder_shift, zero - esp32p4_s8_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \c_remainder_shift, \zero - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \c_remainder_shift, \zero - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \c_remainder_shift, \zero - add \input_ptr, \input_ptr, \dilation_y_offset - - esp32p4_s8_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \c_remainder_shift, \zero - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \c_remainder_shift, \zero - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \c_remainder_shift, \zero - add \input_ptr, \input_ptr, \dilation_y_offset - - esp32p4_s8_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \c_remainder_shift, \zero - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \c_remainder_shift, \zero - add \input_ptr, \input_ptr, \dilation_x_offset - - esp32p4_s8_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \c_remainder_shift, \zero - # add \input_ptr, \input_ptr, \dilation_y_offset -.endm - - - -.macro esp32p4_s8_unaligned_conv2d_hwcn_load_args args, filter_ptr, c_div_x_1, n_div_x, mac_shift, c_remainder, dilation_x_offset, dilation_y_offset - esp32p4_s8_unaligned_conv2d_11cn_load_args \args, \filter_ptr, \c_div_x_1, \n_div_x, \mac_shift, \c_remainder - lw \dilation_x_offset, 108(\args) // input dilation x offset - lw \dilation_y_offset, 112(\args) // input dilation y offset -.endm - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_33cn_bias - .type dl_esp32p4_s8_unaligned_conv2d_33cn_bias, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_33cn_bias: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr / n_remainder bias tmp value - # t4: c_remainder_shift - # t5: zero - # t6: bias_ptr - - # a6(not for extension instructions): dilation_y_offset - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): dilation_x_offset - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: - # s10: - # s11: - - esp32p4_push_12_stacks_3r s0, s1, s8 // push stacks - li s1, -128 // clamp min value - li s8, 127 // clamp max value - esp32p4_s8_unaligned_conv2d_hwcn_load_args a2, a3, t0, t1, a5, a4, t2, a6 - lw t6, 68(a2) // bias - - beqz t1, esp32p4_s8_unaligned_conv2d_33cn_bias_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - esp.movx.r.sar.bytes t3 # t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_33cn_bias_128b - li a7, 8 - beq t3, a7, esp32p4_s8_unaligned_conv2d_33cn_bias_64b - esp32p4_s8_unaligned_conv2d_33cn_bias_32b: - esp32p4_s8_unaligned_conv2d_33cn_bias_32b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_bias_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_33cn_bias_n_remainder - - esp32p4_s8_unaligned_conv2d_33cn_bias_64b: - esp32p4_s8_unaligned_conv2d_33cn_bias_64b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_bias_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_33cn_bias_n_remainder - - esp32p4_s8_unaligned_conv2d_33cn_bias_128b: - esp32p4_s8_unaligned_conv2d_33cn_bias_128b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_bias_128b_multiple_loop - - - esp32p4_s8_unaligned_conv2d_33cn_bias_n_remainder: - lw t1, 140(a2) # t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_33cn_bias_n_remainder_end - li t4, 15 - sub t4, t4, a4 # t4: 15 - c_remainder - li t5, 0 # t5: activation_shift = zero - - esp32p4_s8_unaligned_conv2d_33cn_bias_n_remainder_loop: - // mv t3, a1 # t3: input_ptr - esp.zero.xacc - - esp32p4_s8_conv2d_element_bias t6, t3 // t3: tmp for bias - mv t3, a1 // t3: input_ptr - esp32p4_s8_unaligned_conv2d_33c1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t2, a6, t4, t5 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_bias_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_33cn_bias_n_remainder_end: - - esp32p4_pop_12_stacks_3r s0, s1, s8 // restore registers - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu - .type dl_esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr - # t4: c_remainder_shift - # t5: zero - # t6: bias_ptr - - # a6(not for extension instructions): dilation_y_offset - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): dilation_x_offset - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: activation_alpha/_address - # s10: activation_shift - # s11: - - esp32p4_push_20_stacks_5r s0, s1, s8, s9, s10 - li s1, -128 // clamp min value - li s8, 127 // clamp max value - lw s9, 76(a2) // activation_alpha - lw s10, 84(a2) // activation_shift - esp32p4_s8_unaligned_conv2d_hwcn_load_args a2, a3, t0, t1, a5, a4, t2, a6 - lw t6, 68(a2) // bias - - beqz t1, esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - esp.movx.r.sar.bytes t3 # t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_128b - li a7, 8 - beq t3, a7, esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_64b - esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_32b: - esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_32b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_n_remainder - - esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_64b: - esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_64b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_n_remainder - - esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_128b: - esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_128b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_128b_multiple_loop - - - esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_n_remainder: - lw t1, 140(a2) # t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_n_remainder_end - li t4, 15 - sub t4, t4, a4 # t4: 15 - c_remainder - li t5, 0 # t5: activation_shift = zero - - esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_n_remainder_loop: - // mv t3, a1 # t3: input_ptr - esp.zero.xacc - - esp32p4_s8_conv2d_element_bias t6, t3 // t3: tmp for bias - mv t3, a1 // t3: input_ptr - esp32p4_s8_unaligned_conv2d_33c1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t2, a6, t4, t5 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_leakyrelu s0, s9, s10 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_33cn_bias_leakyrelu_n_remainder_end: - - esp32p4_pop_20_stacks_5r s0, s1, s8, s9, s10 - ret - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_33cn_bias_prelu - .type dl_esp32p4_s8_unaligned_conv2d_33cn_bias_prelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_33cn_bias_prelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr - # t4: c_remainder_shift - # t5: zero - # t6: bias_ptr - - # a6(not for extension instructions): dilation_y_offset - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): dilation_x_offset - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: activation_alpha/_address - # s10: activation_shift - # s11: - - esp32p4_push_20_stacks_5r s0, s1, s8, s9, s10 - li s1, -128 // clamp min value - li s8, 127 // clamp max value - lw s9, 80(a2) // activation_alpha_ptr - lw s10, 84(a2) // activation_shift - esp32p4_s8_unaligned_conv2d_hwcn_load_args a2, a3, t0, t1, a5, a4, t2, a6 - lw t6, 68(a2) // bias - - beqz t1, esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - esp.movx.r.sar.bytes t3 # t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_128b - li a7, 8 - beq t3, a7, esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_64b - esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_32b: - esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_32b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_n_remainder - - esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_64b: - esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_64b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_n_remainder - - esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_128b: - esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_128b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_128b_multiple_loop - - - esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_n_remainder: - lw t1, 140(a2) # t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_n_remainder_end - li t4, 15 - sub t4, t4, a4 # t4: 15 - c_remainder - li t5, 0 # t5: activation_shift = zero - - esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_n_remainder_loop: - // mv t3, a1 # t3: input_ptr - esp.zero.xacc - - esp32p4_s8_conv2d_element_bias t6, t3 // t3: tmp for bias - mv t3, a1 // t3: input_ptr - esp32p4_s8_unaligned_conv2d_33c1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t2, a6, t4, t5 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_prelu s0, s9, s10 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_33cn_bias_prelu_n_remainder_end: - - esp32p4_pop_20_stacks_5r s0, s1, s8, s9, s10 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_33cn - .type dl_esp32p4_s8_unaligned_conv2d_33cn, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_33cn: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr - # t4: c_remainder_shift - # t5: zero - # t6: - - # a6(not for extension instructions): dilation_y_offset - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): dilation_x_offset - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: - # s10: - # s11: - - esp32p4_push_12_stacks_3r s0, s1, s8 // push stacks - li s1, -128 // clamp min value - li s8, 127 // clamp max value - esp32p4_s8_unaligned_conv2d_hwcn_load_args a2, a3, t0, t1, a5, a4, t2, a6 - - beqz t1, esp32p4_s8_unaligned_conv2d_33cn_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - esp.movx.r.sar.bytes t3 # t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_33cn_128b - li a7, 8 - beq t3, a7, esp32p4_s8_unaligned_conv2d_33cn_64b - esp32p4_s8_unaligned_conv2d_33cn_32b: - esp32p4_s8_unaligned_conv2d_33cn_32b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_33cn_n_remainder - - esp32p4_s8_unaligned_conv2d_33cn_64b: - esp32p4_s8_unaligned_conv2d_33cn_64b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_33cn_n_remainder - - esp32p4_s8_unaligned_conv2d_33cn_128b: - esp32p4_s8_unaligned_conv2d_33cn_128b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_128b_multiple_loop - - - esp32p4_s8_unaligned_conv2d_33cn_n_remainder: - lw t1, 140(a2) # t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_33cn_n_remainder_end - li t4, 15 - sub t4, t4, a4 # t4: 15 - c_remainder - li t5, 0 # t5: activation_shift = zero - - esp32p4_s8_unaligned_conv2d_33cn_n_remainder_loop: - mv t3, a1 # t3: input_ptr - esp.zero.xacc - - esp32p4_s8_unaligned_conv2d_33c1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t2, a6, t4, t5 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_33cn_n_remainder_end: - - esp32p4_pop_12_stacks_3r s0, s1, s8 // restore registers - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_33cn_leakyrelu - .type dl_esp32p4_s8_unaligned_conv2d_33cn_leakyrelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_33cn_leakyrelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr - # t4: c_remainder_shift - # t5: zero - # t6: - - # a6(not for extension instructions): dilation_y_offset - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): dilation_x_offset - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: activation_alpha/_address - # s10: activation_shift - # s11: - - esp32p4_push_20_stacks_5r s0, s1, s8, s9, s10 - li s1, -128 // clamp min value - li s8, 127 // clamp max value - lw s9, 76(a2) // activation_alpha - lw s10, 84(a2) // activation_shift - esp32p4_s8_unaligned_conv2d_hwcn_load_args a2, a3, t0, t1, a5, a4, t2, a6 - - beqz t1, esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - esp.movx.r.sar.bytes t3 # t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_128b - li a7, 8 - beq t3, a7, esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_64b - esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_32b: - esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_32b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_n_remainder - - esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_64b: - esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_64b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_n_remainder - - esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_128b: - esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_128b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_128b_multiple_loop - - - esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_n_remainder: - lw t1, 140(a2) # t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_n_remainder_end - li t4, 15 - sub t4, t4, a4 # t4: 15 - c_remainder - li t5, 0 # t5: activation_shift = zero - - esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_n_remainder_loop: - mv t3, a1 # t3: input_ptr - esp.zero.xacc - - esp32p4_s8_unaligned_conv2d_33c1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t2, a6, t4, t5 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_leakyrelu s0, s9, s10 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_33cn_leakyrelu_n_remainder_end: - - esp32p4_pop_20_stacks_5r s0, s1, s8, s9, s10 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_33cn_prelu - .type dl_esp32p4_s8_unaligned_conv2d_33cn_prelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_33cn_prelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr - # t4: c_remainder_shift - # t5: zero - # t6: - - # a6(not for extension instructions): dilation_y_offset - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): dilation_x_offset - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: activation_alpha/_address - # s10: activation_shift - # s11: - - esp32p4_push_20_stacks_5r s0, s1, s8, s9, s10 - li s1, -128 // clamp min value - li s8, 127 // clamp max value - lw s9, 80(a2) // activation_alpha_ptr - lw s10, 84(a2) // activation_shift - esp32p4_s8_unaligned_conv2d_hwcn_load_args a2, a3, t0, t1, a5, a4, t2, a6 - - beqz t1, esp32p4_s8_unaligned_conv2d_33cn_prelu_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - esp.movx.r.sar.bytes t3 # t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_33cn_prelu_128b - li a7, 8 - beq t3, a7, esp32p4_s8_unaligned_conv2d_33cn_prelu_64b - esp32p4_s8_unaligned_conv2d_33cn_prelu_32b: - esp32p4_s8_unaligned_conv2d_33cn_prelu_32b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_prelu_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_33cn_prelu_n_remainder - - esp32p4_s8_unaligned_conv2d_33cn_prelu_64b: - esp32p4_s8_unaligned_conv2d_33cn_prelu_64b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_prelu_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_33cn_prelu_n_remainder - - esp32p4_s8_unaligned_conv2d_33cn_prelu_128b: - esp32p4_s8_unaligned_conv2d_33cn_prelu_128b_multiple_loop: - mv t3, a1 # t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_33c16 q0, q1, q2, t3, q3, q4, a3, t0, a4, t2, a6, a7 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_prelu_128b_multiple_loop - - - esp32p4_s8_unaligned_conv2d_33cn_prelu_n_remainder: - lw t1, 140(a2) # t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_33cn_prelu_n_remainder_end - li t4, 15 - sub t4, t4, a4 # t4: 15 - c_remainder - li t5, 0 # t5: activation_shift = zero - - esp32p4_s8_unaligned_conv2d_33cn_prelu_n_remainder_loop: - mv t3, a1 # t3: input_ptr - esp.zero.xacc - - esp32p4_s8_unaligned_conv2d_33c1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t2, a6, t4, t5 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_prelu s0, s9, s10 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_33cn_prelu_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_33cn_prelu_n_remainder_end: - - esp32p4_pop_20_stacks_5r s0, s1, s8, s9, s10 - ret - - - - -############################################################################################################################################################ -#### -#### esp32p4_s8_unaligned_conv2d_hwcn series -#### -############################################################################################################################################################ -.macro esp32p4_s8_unaligned_conv2d_hwc16 input_v0, input_front, input_back, filter_v0, filter_v1, input_ptr, filter_ptr, c_div_x_1, c_remainder, dilation_x_offset, dilation_y_offset, filter_h, filter_w, args, filter_y_offset, filter_n_offset, temp - lw \filter_h, 52(\args) // filter_height - 21: - lw \filter_w, 56(\args) // filter_width - addi \filter_w, \filter_w, -1 - beqz \filter_w, 22f - 20: - esp32p4_s8_unaligned_conv2d_11c16 \input_v0, \input_front, \input_back, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \c_remainder, \temp - add \input_ptr, \input_ptr, \dilation_x_offset - - addi \filter_w, \filter_w, -1 - bnez \filter_w, 20b - 22: - esp32p4_s8_unaligned_conv2d_11c16 \input_v0, \input_front, \input_back, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \c_remainder, \temp - - add \filter_ptr, \filter_ptr, \filter_y_offset - add \input_ptr, \input_ptr, \dilation_y_offset - - addi \filter_h, \filter_h, -1 - bnez \filter_h, 21b - - add \filter_ptr, \filter_ptr, \filter_n_offset -.endm - - - -.macro esp32p4_s8_unaligned_conv2d_hwc1 input_v0, input_front, input_back, filter_v0, filter_front, filter_back, input_ptr, filter_ptr, c_div_x_1, c_remainder, dilation_x_offset, dilation_y_offset, filter_h, filter_w, args, c_remainder_shift, zero, filter_y_offset, filter_n_offset - lw \filter_h, 52(\args) // filter_height - 21: - lw \filter_w, 56(\args) // filter_width - addi \filter_w, \filter_w, -1 - beqz \filter_w, 22f - 20: - esp32p4_s8_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \c_remainder_shift, \zero - add \input_ptr, \input_ptr, \dilation_x_offset - - addi \filter_w, \filter_w, -1 - bnez \filter_w, 20b - 22: - esp32p4_s8_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \c_remainder_shift, \zero - - add \filter_ptr, \filter_ptr, \filter_y_offset - add \input_ptr, \input_ptr, \dilation_y_offset - - addi \filter_h, \filter_h, -1 - bnez \filter_h, 21b - - add \filter_ptr, \filter_ptr, \filter_n_offset -.endm - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_hwcn_bias - .type dl_esp32p4_s8_unaligned_conv2d_hwcn_bias, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_hwcn_bias: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr / n_remainder bias tmp value - # t4: c_remainder_shift - # t5: zero - # t6: bias_ptr - - # a6(not for extension instructions): dilation_y_offset - # a7(not for extension instructions): filter_h - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): dilation_x_offset - # s2(not for extension instructions): filter_w - # s3(not for extension instructions): filter_y_offset - # s4(not for extension instructions): filter_n_offset - # s5(not for extension instructions): tmp value - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: - # s10: - # s11: - - esp32p4_push_28_stacks_7r s2, s3, s4, s5, s0, s1, s8 - li s1, -128 // clamp min value - li s8, 127 // clamp max value - esp32p4_s8_unaligned_conv2d_hwcn_load_args a2, a3, t0, t1, a5, a4, t2, a6 - lw t6, 68(a2) // bias - - beqz t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - lw s3, 60(a2) // s3: filter_y_offset - lw s4, 144(a2) // s4: filter_n_offset - esp.movx.r.sar.bytes t3 // t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_hwcn_bias_128b - li s5, 8 - beq t3, s5, esp32p4_s8_unaligned_conv2d_hwcn_bias_64b - // esp32p4_s8_unaligned_conv2d_hwcn_bias_32b: - esp32p4_s8_unaligned_conv2d_hwcn_bias_32b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_hwcn_bias_n_remainder - - esp32p4_s8_unaligned_conv2d_hwcn_bias_64b: - esp32p4_s8_unaligned_conv2d_hwcn_bias_64b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_hwcn_bias_n_remainder - - esp32p4_s8_unaligned_conv2d_hwcn_bias_128b: - esp32p4_s8_unaligned_conv2d_hwcn_bias_128b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_128b_multiple_loop - - esp32p4_s8_unaligned_conv2d_hwcn_bias_n_remainder: - lw t1, 140(a2) // t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_n_remainder_end - lw s3, 160(a2) // a3: filter_y_offset_unaligned - lw s4, 164(a2) // t3: filter_n_offset_unaligned - lw a3, 168(a2) // a3: filter_ptr_unaligned - - li t4, 15 - sub t4, t4, a4 // t4: 15 - c_remainder - li t5, 0 // t5: zero - - esp32p4_s8_unaligned_conv2d_hwcn_bias_n_remainder_loop: - // mv t3, a1 // t3: input_ptr - esp.zero.xacc - - esp32p4_s8_conv2d_element_bias t6, t3 // t3: tmp for bias - mv t3, a1 // t3: input_ptr - esp32p4_s8_unaligned_conv2d_hwc1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t2, a6, a7, s2, a2, t4, t5, s3, s4 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_hwcn_bias_n_remainder_end: - - esp32p4_pop_28_stacks_7r s2, s3, s4, s5, s0, s1, s8 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu - .type dl_esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr / n_remainder bias tmp value - # t4: c_remainder_shift - # t5: zero - # t6: bias_ptr - - # a6(not for extension instructions): dilation_y_offset - # a7(not for extension instructions): filter_h - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): dilation_x_offset - # s2(not for extension instructions): filter_w - # s3(not for extension instructions): filter_y_offset - # s4(not for extension instructions): filter_n_offset - # s5(not for extension instructions): tmp value - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: activation_alpha/_address - # s10: activation_shift - # s11: - - esp32p4_push_36_stacks_9r s2, s3, s4, s5, s0, s1, s8, s9, s10 - li s1, -128 // clamp min value - li s8, 127 // clamp max value - lw s9, 76(a2) // activation_alpha - lw s10, 84(a2) // activation_shift - esp32p4_s8_unaligned_conv2d_hwcn_load_args a2, a3, t0, t1, a5, a4, t2, a6 - lw t6, 68(a2) // bias - - beqz t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - lw s3, 60(a2) // s3: filter_y_offset - lw s4, 144(a2) // s4: filter_n_offset - esp.movx.r.sar.bytes t3 // t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_128b - li s5, 8 - beq t3, s5, esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_64b - // esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_32b: - esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_32b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_n_remainder - - esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_64b: - esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_64b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_n_remainder - - esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_128b: - esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_128b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_128b_multiple_loop - - esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_n_remainder: - lw t1, 140(a2) // t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_n_remainder_end - lw s3, 160(a2) // a3: filter_y_offset_unaligned - lw s4, 164(a2) // t3: filter_n_offset_unaligned - lw a3, 168(a2) // a3: filter_ptr_unaligned - - li t4, 15 - sub t4, t4, a4 // t4: 15 - c_remainder - li t5, 0 // t5: zero - - esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_n_remainder_loop: - // mv t3, a1 // t3: input_ptr - esp.zero.xacc - - esp32p4_s8_conv2d_element_bias t6, t3 // t3: tmp for bias - mv t3, a1 // t3: input_ptr - esp32p4_s8_unaligned_conv2d_hwc1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t2, a6, a7, s2, a2, t4, t5, s3, s4 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_leakyrelu s0, s9, s10 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_hwcn_bias_leakyrelu_n_remainder_end: - - esp32p4_pop_36_stacks_9r s2, s3, s4, s5, s0, s1, s8, s9, s10 - ret - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu - .type dl_esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr / n_remainder bias tmp value - # t4: c_remainder_shift - # t5: zero - # t6: bias_ptr - - # a6(not for extension instructions): dilation_y_offset - # a7(not for extension instructions): filter_h - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): dilation_x_offset - # s2(not for extension instructions): filter_w - # s3(not for extension instructions): filter_y_offset - # s4(not for extension instructions): filter_n_offset - # s5(not for extension instructions): tmp value - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: activation_alpha/_address - # s10: activation_shift - # s11: - - esp32p4_push_36_stacks_9r s2, s3, s4, s5, s0, s1, s8, s9, s10 - li s1, -128 // clamp min value - li s8, 127 // clamp max value - lw s9, 80(a2) // activation_alpha_ptr - lw s10, 84(a2) // activation_shift - esp32p4_s8_unaligned_conv2d_hwcn_load_args a2, a3, t0, t1, a5, a4, t2, a6 - lw t6, 68(a2) // bias - - beqz t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - lw s3, 60(a2) // s3: filter_y_offset - lw s4, 144(a2) // s4: filter_n_offset - esp.movx.r.sar.bytes t3 // t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_128b - li s5, 8 - beq t3, s5, esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_64b - // esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_32b: - esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_32b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_n_remainder - - esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_64b: - esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_64b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_n_remainder - - esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_128b: - esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_128b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_conv2d_128b_vector_bias t6 - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_128b_multiple_loop - - esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_n_remainder: - lw t1, 140(a2) // t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_n_remainder_end - lw s3, 160(a2) // a3: filter_y_offset_unaligned - lw s4, 164(a2) // t3: filter_n_offset_unaligned - lw a3, 168(a2) // a3: filter_ptr_unaligned - - li t4, 15 - sub t4, t4, a4 // t4: 15 - c_remainder - li t5, 0 // t5: zero - - esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_n_remainder_loop: - // mv t3, a1 // t3: input_ptr - esp.zero.xacc - - esp32p4_s8_conv2d_element_bias t6, t3 // t3: tmp for bias - mv t3, a1 // t3: input_ptr - esp32p4_s8_unaligned_conv2d_hwc1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t2, a6, a7, s2, a2, t4, t5, s3, s4 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_prelu s0, s9, s10 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_hwcn_bias_prelu_n_remainder_end: - - esp32p4_pop_36_stacks_9r s2, s3, s4, s5, s0, s1, s8, s9, s10 - ret - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_hwcn - .type dl_esp32p4_s8_unaligned_conv2d_hwcn, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_hwcn: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr - # t4: c_remainder_shift - # t5: zero - # t6: - - # a6(not for extension instructions): dilation_y_offset - # a7(not for extension instructions): filter_h - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): dilation_x_offset - # s2(not for extension instructions): filter_w - # s3(not for extension instructions): filter_y_offset - # s4(not for extension instructions): filter_n_offset - # s5(not for extension instructions): tmp value - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: - # s10: - # s11: - - esp32p4_push_28_stacks_7r s2, s3, s4, s5, s0, s1, s8 - li s1, -128 // clamp min value - li s8, 127 // clamp max value - esp32p4_s8_unaligned_conv2d_hwcn_load_args a2, a3, t0, t1, a5, a4, t2, a6 - - beqz t1, esp32p4_s8_unaligned_conv2d_hwcn_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - lw s3, 60(a2) // s3: filter_y_offset - lw s4, 144(a2) // s4: filter_n_offset - esp.movx.r.sar.bytes t3 // t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_hwcn_128b - li s5, 8 - beq t3, s5, esp32p4_s8_unaligned_conv2d_hwcn_64b - // esp32p4_s8_unaligned_conv2d_hwcn_32b: - esp32p4_s8_unaligned_conv2d_hwcn_32b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_hwcn_n_remainder - - esp32p4_s8_unaligned_conv2d_hwcn_64b: - esp32p4_s8_unaligned_conv2d_hwcn_64b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_hwcn_n_remainder - - esp32p4_s8_unaligned_conv2d_hwcn_128b: - esp32p4_s8_unaligned_conv2d_hwcn_128b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_128b_multiple_loop - - esp32p4_s8_unaligned_conv2d_hwcn_n_remainder: - lw t1, 140(a2) // t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_hwcn_n_remainder_end - lw s3, 160(a2) // a3: filter_y_offset_unaligned - lw s4, 164(a2) // t3: filter_n_offset_unaligned - lw a3, 168(a2) // a3: filter_ptr_unaligned - - li t4, 15 - sub t4, t4, a4 // t4: 15 - c_remainder - li t5, 0 // t5: zero - - esp32p4_s8_unaligned_conv2d_hwcn_n_remainder_loop: - mv t3, a1 // t3: input_ptr - esp.zero.xacc - - esp32p4_s8_unaligned_conv2d_hwc1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t2, a6, a7, s2, a2, t4, t5, s3, s4 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_hwcn_n_remainder_end: - - esp32p4_pop_28_stacks_7r s2, s3, s4, s5, s0, s1, s8 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu - .type dl_esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr - # t4: c_remainder_shift - # t5: zero - # t6: - - # a6(not for extension instructions): dilation_y_offset - # a7(not for extension instructions): filter_h - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): dilation_x_offset - # s2(not for extension instructions): filter_w - # s3(not for extension instructions): filter_y_offset - # s4(not for extension instructions): filter_n_offset - # s5(not for extension instructions): tmp value - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: activation_alpha/_address - # s10: activation_shift - # s11: - - esp32p4_push_36_stacks_9r s2, s3, s4, s5, s0, s1, s8, s9, s10 - li s1, -128 // clamp min value - li s8, 127 // clamp max value - lw s9, 76(a2) // activation_alpha - lw s10, 84(a2) // activation_shift - esp32p4_s8_unaligned_conv2d_hwcn_load_args a2, a3, t0, t1, a5, a4, t2, a6 - - beqz t1, esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - lw s3, 60(a2) // s3: filter_y_offset - lw s4, 144(a2) // s4: filter_n_offset - esp.movx.r.sar.bytes t3 // t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_128b - li s5, 8 - beq t3, s5, esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_64b - // esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_32b: - esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_32b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_n_remainder - - esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_64b: - esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_64b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_n_remainder - - esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_128b: - esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_128b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_relu q0, s9, s10 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_128b_multiple_loop - - esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_n_remainder: - lw t1, 140(a2) // t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_n_remainder_end - lw s3, 160(a2) // a3: filter_y_offset_unaligned - lw s4, 164(a2) // t3: filter_n_offset_unaligned - lw a3, 168(a2) // a3: filter_ptr_unaligned - - li t4, 15 - sub t4, t4, a4 // t4: 15 - c_remainder - li t5, 0 // t5: zero - - esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_n_remainder_loop: - mv t3, a1 // t3: input_ptr - esp.zero.xacc - - esp32p4_s8_unaligned_conv2d_hwc1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t2, a6, a7, s2, a2, t4, t5, s3, s4 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_leakyrelu s0, s9, s10 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_hwcn_leakyrelu_n_remainder_end: - - esp32p4_pop_36_stacks_9r s2, s3, s4, s5, s0, s1, s8, s9, s10 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_conv2d_hwcn_prelu - .type dl_esp32p4_s8_unaligned_conv2d_hwcn_prelu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_conv2d_hwcn_prelu: - - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr - # a4: c_remainder - # a5: mac_shift - # t3: output_sar_byte / moving_input_ptr - # t4: c_remainder_shift - # t5: zero - # t6: - - # a6(not for extension instructions): dilation_y_offset - # a7(not for extension instructions): filter_h - # t0(not for extension instructions): c_div_x_1 - # t1(not for extension instructions): n_div_x / n_remainder - # t2(not for extension instructions): dilation_x_offset - # s2(not for extension instructions): filter_w - # s3(not for extension instructions): filter_y_offset - # s4(not for extension instructions): filter_n_offset - # s5(not for extension instructions): tmp value - - # s0: output - # s1: clamp min value - # s8: clamp max value - # s9: activation_alpha/_address - # s10: activation_shift - # s11: - - esp32p4_push_36_stacks_9r s2, s3, s4, s5, s0, s1, s8, s9, s10 - li s1, -128 // clamp min value - li s8, 127 // clamp max value - lw s9, 80(a2) // activation_alpha_ptr - lw s10, 84(a2) // activation_shift - esp32p4_s8_unaligned_conv2d_hwcn_load_args a2, a3, t0, t1, a5, a4, t2, a6 - - beqz t1, esp32p4_s8_unaligned_conv2d_hwcn_prelu_n_remainder - esp.ld.128.usar.ip q0, a0, 0 - lw s3, 60(a2) // s3: filter_y_offset - lw s4, 144(a2) // s4: filter_n_offset - esp.movx.r.sar.bytes t3 // t3: output_sar_byte - - beqz t3, esp32p4_s8_unaligned_conv2d_hwcn_prelu_128b - li s5, 8 - beq t3, s5, esp32p4_s8_unaligned_conv2d_hwcn_prelu_64b - // esp32p4_s8_unaligned_conv2d_hwcn_prelu_32b: - esp32p4_s8_unaligned_conv2d_hwcn_prelu_32b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t3 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_prelu_32b_multiple_loop - j esp32p4_s8_unaligned_conv2d_hwcn_prelu_n_remainder - - esp32p4_s8_unaligned_conv2d_hwcn_prelu_64b: - esp32p4_s8_unaligned_conv2d_hwcn_prelu_64b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_prelu_64b_multiple_loop - j esp32p4_s8_unaligned_conv2d_hwcn_prelu_n_remainder - - esp32p4_s8_unaligned_conv2d_hwcn_prelu_128b: - esp32p4_s8_unaligned_conv2d_hwcn_prelu_128b_multiple_loop: - mv t3, a1 // t3: input_ptr - esp.zero.qacc - - esp32p4_s8_unaligned_conv2d_hwc16 q0, q1, q2, q3, q4, t3, a3, t0, a4, t2, a6, a7, s2, a2, s3, s4, s5 - esp32p4_s8_128b_vector_shift_result q0, a5 - esp32p4_s8_128b_vector_prelu q0, q1, s9, s10 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_prelu_128b_multiple_loop - - esp32p4_s8_unaligned_conv2d_hwcn_prelu_n_remainder: - lw t1, 140(a2) // t1: n_remainder - beqz t1, esp32p4_s8_unaligned_conv2d_hwcn_prelu_n_remainder_end - lw s3, 160(a2) // a3: filter_y_offset_unaligned - lw s4, 164(a2) // t3: filter_n_offset_unaligned - lw a3, 168(a2) // a3: filter_ptr_unaligned - - li t4, 15 - sub t4, t4, a4 // t4: 15 - c_remainder - li t5, 0 // t5: zero - - esp32p4_s8_unaligned_conv2d_hwcn_prelu_n_remainder_loop: - mv t3, a1 // t3: input_ptr - esp.zero.xacc - - esp32p4_s8_unaligned_conv2d_hwc1 q0, q1, q2, q3, q4, q5, t3, a3, t0, a4, t2, a6, a7, s2, a2, t4, t5, s3, s4 - esp32p4_s8_element_result s0, a5 - esp32p4_clamp s0, s1, s8 - esp32p4_s8_element_prelu s0, s9, s10 - esp32p4_s8_element_store a0, s0 - - addi t1, t1, -1 - bnez t1, esp32p4_s8_unaligned_conv2d_hwcn_prelu_n_remainder_loop - - esp32p4_s8_unaligned_conv2d_hwcn_prelu_n_remainder_end: - - esp32p4_pop_36_stacks_9r s2, s3, s4, s5, s0, s1, s8, s9, s10 - ret diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_unaligned_depthwise_conv2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_unaligned_depthwise_conv2d.S deleted file mode 100644 index 40728751..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_unaligned_depthwise_conv2d.S +++ /dev/null @@ -1,1348 +0,0 @@ -#include "dl_esp32p4_s8.S" -#include "dl_esp32p4_common.S" - - -############################################################################################################################################################ -#### -#### esp32p4_s8_unaligned_depthwise_conv2d_33c1 series -#### -############################################################################################################################################################ -.macro esp32p4_s8_unaligned_depthwise_conv2d_33s1 input_v0, input_v1, input_v2, input_back, filter_v0, filter_v1, filter_v2, input_ptr, filter_ptr, dilation_x_offset_16, dilation_y_offset_16, next_33s1_16 - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_y_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - esp.src.q.ld.ip \input_v0, \input_ptr, 16, \input_v2, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - esp.src.q.ld.ip \input_v1, \input_ptr, 16, \input_v0, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - esp.src.q.ld.ip \input_v2, \input_ptr, 16, \input_v1, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_y_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - esp.src.q.ld.ip \input_v0, \input_ptr, 16, \input_v2, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - esp.src.q.ld.ip \input_v1, \input_ptr, 16, \input_v0, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - esp.src.q.ld.ip \input_v2, \input_ptr, 16, \input_v1, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \next_33s1_16 - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - esp.src.q.ld.ip \input_v0, \input_ptr, 16, \input_v2, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - esp.src.q.ld.ip \input_v1, \input_ptr, 16, \input_v0, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - esp.src.q.ld.ip \input_v2, \input_ptr, 16, \input_v1, \input_back -.endm - - - -.macro esp32p4_s8_unaligned_depthwise_conv2d_33s1_last input_v0, input_v1, input_v2, input_back, filter_v0, filter_v1, input_ptr, filter_ptr, dilation_x_offset_16, dilation_y_offset_16, next_33s1_16 - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_y_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - esp.src.q.ld.ip \input_v0, \input_ptr, 16, \input_v2, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - esp.src.q.ld.ip \input_v1, \input_ptr, 16, \input_v0, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v2, \filter_v0 - esp.src.q.ld.ip \input_v2, \input_ptr, 16, \input_v1, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_y_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v0, \filter_v1 - esp.src.q.ld.ip \input_v0, \input_ptr, 16, \input_v2, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v1, \filter_v0 - esp.src.q.ld.ip \input_v1, \input_ptr, 16, \input_v0, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v2, \filter_v1 - esp.src.q.ld.ip \input_v2, \input_ptr, 16, \input_v1, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \next_33s1_16 - esp.vmulas.s8.qacc.ld.ip \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - esp.src.q \input_v2, \input_v2, \input_back - - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - esp.vmulas.s8.qacc \input_v2, \filter_v0 -.endm - - - -.macro esp32p4_s8_unaligned_depthwise_conv2d_11r1 input_v0, input_front, input_back, filter_v0, filter_front, filter_back, input_ptr, filter_ptr, c_remainder, forward - esp.ld.128.usar.xp \input_v0, \input_ptr, \c_remainder - esp.vld.128.xp \input_back, \input_ptr, \forward - esp.src.q \input_v0, \input_v0, \input_back - - esp.ld.128.usar.xp \filter_v0, \filter_ptr, \c_remainder - esp.vld.128.ip \filter_back, \filter_ptr, 0 - esp.src.q \filter_v0, \filter_v0, \filter_back - - esp.vmulas.s8.qacc \input_v0, \filter_v0 -.endm - - - -.macro esp32p4_s8_unaligned_depthwise_conv2d_33r1 input_v0, input_front, input_back, filter_v0, filter_front, filter_back, input_ptr, filter_ptr, dilation_x_offset_c_remainder, dilation_y_offset_c_remainder, c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_c_remainder - esp32p4_s8_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_c_remainder - esp32p4_s8_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_y_offset_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_c_remainder - esp32p4_s8_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_c_remainder - esp32p4_s8_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_y_offset_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_c_remainder - esp32p4_s8_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_c_remainder - // esp32p4_s8_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_y_offset_c_remainder - - esp.ld.128.usar.xp \input_v0, \input_ptr, \c_remainder - esp.vld.128.ip \input_back, \input_ptr, 0 - esp.src.q \input_v0, \input_v0, \input_back - - esp.ld.128.usar.xp \filter_v0, \filter_ptr, \c_remainder - esp.vld.128.ip \filter_back, \filter_ptr, 0 - esp.src.q \filter_v0, \filter_v0, \filter_back - - esp.vmulas.s8.qacc \input_v0, \filter_v0 -.endm - - - -.macro esp32p4_s8_unaligned_depthwise_conv2d_33c1_load_args args, filter_ptr, dilation_x_offset, dilation_y_offset, next_hwx1, c_div_x_1, mac_shift - # dilation_x_offset = input_channel_with_padding * dilation_x * sizeof(T) - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) - # next_hwx1 = (-(filter_width - 1) * dilation_x * input_channel_with_padding - (filter_height - 1) * dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) + 16 - - lw \filter_ptr, 48(\args) - lw \dilation_x_offset, 124(\args) - lw \dilation_y_offset, 128(\args) - lw \next_hwx1, 132(\args) - lw \c_div_x_1, 100(\args) - lw \mac_shift, 64 (\args) -.endm - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias - .type dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias: - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr / tmp value - # a4: input dilation x offset / tmp value - # a5: input dilation y offset / tmp value - # t3: next_33s1 / tmp value - # t4: mac_shift - # t5: c_div_x_1 / c_remainder - # t6: output_sar_byte - - # a6(not for extension instructions): tmp value - # a7(not for extension instructions): - # t0(not for extension instructions): tmp value - # t1(not for extension instructions): - # t2(not for extension instructions): - # s2(not for extension instructions): - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: bias_ptr - # s9: - # s10: - # s11: - - esp32p4_push_4_stacks_1r s8 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_load_args a2, a3, a4, a5, t3, t5, t4 - lw s8, 68(a2) // bias - - addi a4, a4, -16 // a4: dilation_x_offset - 16 - addi a5, a5, -16 // a5: dilation_y_offset - 16 - addi t3, t3, -16 // t3: next_33s1 - 16 - - bltz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_c_remainder - esp.ld.128.usar.ip q0, a0, 0 - esp.movx.r.sar.bytes t6 // t6: output_sar_byte - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.xp q3, a1, a4 - esp.src.q.ld.ip q4, a3, 16, q0, q3 // q4: filter_v0; q0: input_v0 - - esp.ld.128.usar.ip q1, a1, 16 - esp.ld.128.usar.xp q3, a1, a4 - esp.src.q.ld.ip q2, a1, 16, q1, q3 // q2: input_v2; q1: input_v1 - - beqz t6, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_128b - li a6, 8 - beq t6, a6, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_64b - // esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_32b: - beqz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_32b_last - esp.lp.setup 0, t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_32b_loop_end - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, q6, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_32b_loop_end: - esp32p4_s8_32b_unaligned_vector_store q3, a0, t6 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_32b_last: - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_33s1_last q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_32b_unaligned_vector_store q3, a0, t6 - - j esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_64b: - beqz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_64b_last - esp.lp.setup 0, t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_64b_loop_end - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, q6, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_64b_loop_end: - esp32p4_s8_64b_unaligned_vector_store q3, a0 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_64b_last: - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_33s1_last q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_64b_unaligned_vector_store q3, a0 - - j esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_128b: - beqz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_128b_last - esp.lp.setup 0, t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_128b_loop_end - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, q6, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_128b_loop_end: - esp32p4_s8_128b_aligned_vector_store q3, a0 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_128b_last: - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_33s1_last q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_128b_aligned_vector_store q3, a0 - - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_c_remainder: - lw t5, 136(a2) // t5: c_remainder - beqz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_c_remainder_end - esp.zero.qacc - - addi a4, a4, 16 - addi a5, a5, 16 - - sub a4, a4, t5 // a4: dilation_x_offset - c_remainder - sub a5, a5, t5 // a5: dilation_y_offset - c_remainder - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_33r1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t5 - esp32p4_s8_128b_vector_shift_result q0, t4 - dl_esp32p4_s8_store_remainder q0, a3, a4, a5, t3, t0, a0, t5 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_c_remainder_end: - - esp32p4_pop_4_stacks_1r s8 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu - .type dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu: - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr / tmp value - # a4: input dilation x offset / tmp value - # a5: input dilation y offset / tmp value - # t3: next_33s1 / tmp value - # t4: mac_shift - # t5: c_div_x_1 / c_remainder - # t6: output_sar_byte - - # a6(not for extension instructions): tmp value - # a7(not for extension instructions): - # t0(not for extension instructions): tmp value - # t1(not for extension instructions): - # t2(not for extension instructions): - # s2(not for extension instructions): - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: activation_alpha/_address - # s1: activation_shift - # s8: bias_ptr - # s9: - # s10: - # s11: - - esp32p4_push_12_stacks_3r s0, s1, s8 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_load_args a2, a3, a4, a5, t3, t5, t4 - lw s0, 76(a2) // activation_alpha - lw s1, 84(a2) // activation_shift - lw s8, 68(a2) // bias - - addi a4, a4, -16 // a4: dilation_x_offset - 16 - addi a5, a5, -16 // a5: dilation_y_offset - 16 - addi t3, t3, -16 // t3: next_33s1 - 16 - - bltz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_c_remainder - esp.ld.128.usar.ip q0, a0, 0 - esp.movx.r.sar.bytes t6 // t6: output_sar_byte - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.xp q3, a1, a4 - esp.src.q.ld.ip q4, a3, 16, q0, q3 // q4: filter_v0; q0: input_v0 - - esp.ld.128.usar.ip q1, a1, 16 - esp.ld.128.usar.xp q3, a1, a4 - esp.src.q.ld.ip q2, a1, 16, q1, q3 // q2: input_v2; q1: input_v1 - - beqz t6, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_128b - li a6, 8 - beq t6, a6, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_64b - // esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_32b: - beqz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_32b_last - esp.lp.setup 0, t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_32b_loop_end - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, q6, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_128b_vector_relu q3, s0, s1 - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_32b_loop_end: - esp32p4_s8_32b_unaligned_vector_store q3, a0, t6 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_32b_last: - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_33s1_last q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_128b_vector_relu q3, s0, s1 - esp32p4_s8_32b_unaligned_vector_store q3, a0, t6 - - j esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_64b: - beqz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_64b_last - esp.lp.setup 0, t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_64b_loop_end - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, q6, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_128b_vector_relu q3, s0, s1 - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_64b_loop_end: - esp32p4_s8_64b_unaligned_vector_store q3, a0 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_64b_last: - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_33s1_last q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_128b_vector_relu q3, s0, s1 - esp32p4_s8_64b_unaligned_vector_store q3, a0 - - j esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_128b: - beqz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_128b_last - esp.lp.setup 0, t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_128b_loop_end - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, q6, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_128b_vector_relu q3, s0, s1 - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_128b_loop_end: - esp32p4_s8_128b_aligned_vector_store q3, a0 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_128b_last: - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_33s1_last q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_128b_vector_relu q3, s0, s1 - esp32p4_s8_128b_aligned_vector_store q3, a0 - - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_c_remainder: - lw t5, 136(a2) // t5: c_remainder - beqz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_c_remainder_end - esp.zero.qacc - - addi a4, a4, 16 - addi a5, a5, 16 - - sub a4, a4, t5 // a4: dilation_x_offset - c_remainder - sub a5, a5, t5 // a5: dilation_y_offset - c_remainder - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_33r1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t5 - esp32p4_s8_128b_vector_shift_result q0, t4 - esp32p4_s8_128b_vector_relu q0, s0, s1 - dl_esp32p4_s8_store_remainder q0, a3, a4, a5, t3, t0, a0, t5 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_bias_relu_c_remainder_end: - - esp32p4_pop_12_stacks_3r s0, s1, s8 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1 - .type dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1: - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr / tmp value - # a4: input dilation x offset / tmp value - # a5: input dilation y offset / tmp value - # t3: next_33s1 / tmp value - # t4: mac_shift - # t5: c_div_x_1 / c_remainder - # t6: output_sar_byte - - # a6(not for extension instructions): tmp value - # a7(not for extension instructions): - # t0(not for extension instructions): tmp value - # t1(not for extension instructions): - # t2(not for extension instructions): - # s2(not for extension instructions): - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_load_args a2, a3, a4, a5, t3, t5, t4 - - addi a4, a4, -16 // a4: dilation_x_offset - 16 - addi a5, a5, -16 // a5: dilation_y_offset - 16 - addi t3, t3, -16 // t3: next_33s1 - 16 - - bltz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_c_remainder - esp.ld.128.usar.ip q0, a0, 0 - esp.movx.r.sar.bytes t6 // t6: output_sar_byte - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.xp q3, a1, a4 - esp.src.q.ld.ip q4, a3, 16, q0, q3 // q4: filter_v0; q0: input_v0 - - esp.ld.128.usar.ip q1, a1, 16 - esp.ld.128.usar.xp q3, a1, a4 - esp.src.q.ld.ip q2, a1, 16, q1, q3 // q2: input_v2; q1: input_v1 - - beqz t6, esp32p4_s8_unaligned_depthwise_conv2d_33c1_128b - li a6, 8 - beq t6, a6, esp32p4_s8_unaligned_depthwise_conv2d_33c1_64b - // esp32p4_s8_unaligned_depthwise_conv2d_33c1_32b: - beqz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_32b_last - esp.lp.setup 0, t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_32b_loop_end - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, q6, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_unaligned_depthwise_conv2d_33c1_32b_loop_end: - esp32p4_s8_32b_unaligned_vector_store q3, a0, t6 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_32b_last: - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_33s1_last q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_32b_unaligned_vector_store q3, a0, t6 - - j esp32p4_s8_unaligned_depthwise_conv2d_33c1_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_64b: - beqz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_64b_last - esp.lp.setup 0, t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_64b_loop_end - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, q6, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_unaligned_depthwise_conv2d_33c1_64b_loop_end: - esp32p4_s8_64b_unaligned_vector_store q3, a0 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_64b_last: - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_33s1_last q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_64b_unaligned_vector_store q3, a0 - - j esp32p4_s8_unaligned_depthwise_conv2d_33c1_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_128b: - beqz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_128b_last - esp.lp.setup 0, t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_128b_loop_end - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, q6, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_unaligned_depthwise_conv2d_33c1_128b_loop_end: - esp32p4_s8_128b_aligned_vector_store q3, a0 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_128b_last: - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_33s1_last q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_128b_aligned_vector_store q3, a0 - - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_c_remainder: - lw t5, 136(a2) // t5: c_remainder - beqz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_c_remainder_end - esp.zero.qacc - - addi a4, a4, 16 - addi a5, a5, 16 - - sub a4, a4, t5 // a4: dilation_x_offset - c_remainder - sub a5, a5, t5 // a5: dilation_y_offset - c_remainder - esp32p4_s8_unaligned_depthwise_conv2d_33r1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t5 - esp32p4_s8_128b_vector_shift_result q0, t4 - dl_esp32p4_s8_store_remainder q0, a3, a4, a5, t3, t0, a0, t5 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_c_remainder_end: - - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu - .type dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu: - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr / tmp value - # a4: input dilation x offset / tmp value - # a5: input dilation y offset / tmp value - # t3: next_33s1 / tmp value - # t4: mac_shift - # t5: c_div_x_1 / c_remainder - # t6: output_sar_byte - - # a6(not for extension instructions): tmp value - # a7(not for extension instructions): - # t0(not for extension instructions): tmp value - # t1(not for extension instructions): - # t2(not for extension instructions): - # s2(not for extension instructions): - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: activation_alpha/_address - # s1: activation_shift - # s8: - # s9: - # s10: - # s11: - - esp32p4_push_12_stacks_3r s0, s1, s8 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_load_args a2, a3, a4, a5, t3, t5, t4 - lw s0, 76(a2) // activation_alpha - lw s1, 84(a2) // activation_shift - - addi a4, a4, -16 // a4: dilation_x_offset - 16 - addi a5, a5, -16 // a5: dilation_y_offset - 16 - addi t3, t3, -16 // t3: next_33s1 - 16 - - bltz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_c_remainder - esp.ld.128.usar.ip q0, a0, 0 - esp.movx.r.sar.bytes t6 // t6: output_sar_byte - - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.xp q3, a1, a4 - esp.src.q.ld.ip q4, a3, 16, q0, q3 // q4: filter_v0; q0: input_v0 - - esp.ld.128.usar.ip q1, a1, 16 - esp.ld.128.usar.xp q3, a1, a4 - esp.src.q.ld.ip q2, a1, 16, q1, q3 // q2: input_v2; q1: input_v1 - - beqz t6, esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_128b - li a6, 8 - beq t6, a6, esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_64b - // esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_32b: - beqz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_32b_last - esp.lp.setup 0, t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_32b_loop_end - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, q6, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_128b_vector_relu q3, s0, s1 - esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_32b_loop_end: - esp32p4_s8_32b_unaligned_vector_store q3, a0, t6 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_32b_last: - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_33s1_last q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_128b_vector_relu q3, s0, s1 - esp32p4_s8_32b_unaligned_vector_store q3, a0, t6 - - j esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_64b: - beqz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_64b_last - esp.lp.setup 0, t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_64b_loop_end - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, q6, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_128b_vector_relu q3, s0, s1 - esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_64b_loop_end: - esp32p4_s8_64b_unaligned_vector_store q3, a0 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_64b_last: - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_33s1_last q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_128b_vector_relu q3, s0, s1 - esp32p4_s8_64b_unaligned_vector_store q3, a0 - - j esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_128b: - beqz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_128b_last - esp.lp.setup 0, t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_128b_loop_end - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, q6, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_128b_vector_relu q3, s0, s1 - esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_128b_loop_end: - esp32p4_s8_128b_aligned_vector_store q3, a0 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_128b_last: - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_33s1_last q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t3 - esp32p4_s8_128b_vector_shift_result q3, t4 - esp32p4_s8_128b_vector_relu q3, s0, s1 - esp32p4_s8_128b_aligned_vector_store q3, a0 - - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_c_remainder: - lw t5, 136(a2) // t5: c_remainder - beqz t5, esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_c_remainder_end - esp.zero.qacc - - addi a4, a4, 16 - addi a5, a5, 16 - - sub a4, a4, t5 // a4: dilation_x_offset - c_remainder - sub a5, a5, t5 // a5: dilation_y_offset - c_remainder - esp32p4_s8_unaligned_depthwise_conv2d_33r1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t5 - esp32p4_s8_128b_vector_shift_result q0, t4 - esp32p4_s8_128b_vector_relu q0, s0, s1 - dl_esp32p4_s8_store_remainder q0, a3, a4, a5, t3, t0, a0, t5 - - esp32p4_s8_unaligned_depthwise_conv2d_33c1_relu_c_remainder_end: - - esp32p4_pop_12_stacks_3r s0, s1, s8 - ret - - - - - - -############################################################################################################################################################ -#### -#### esp32p4_s8_unaligned_depthwise_conv2d_hwc1 series -#### -############################################################################################################################################################ -.macro esp32p4_s8_unaligned_depthwise_conv2d_1ws1 input_v0, input_v1, input_back, input_ptr, filter_v0, filter_ptr, dilation_x_offset_16, dilation_y_offset, filter_w, filter_w_rs1_1, filter_y_offset, temp - blez \filter_w_rs1_1, 1f - esp.lp.setup 0, \filter_w_rs1_1, 0f - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v0, \filter_v0 - esp.src.q.ld.ip \input_v0, \input_ptr, 16, \input_v1, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v1, \filter_v0 - 0: esp.src.q.ld.ip \input_v1, \input_ptr, 16, \input_v0, \input_back - - 1: - andi \temp, \filter_w, 0xfffffffe - beq \filter_w, \temp, 2f - # three 8-input-element left - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v0, \filter_v0 - esp.src.q.ld.ip \input_v0, \input_ptr, 16, \input_v1, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_y_offset - esp.vmulas.s8.qacc.ld.xp \filter_v0, \filter_ptr, \filter_y_offset, \input_v1, \filter_v0 - esp.src.q.ld.ip \input_v1, \input_ptr, 16, \input_v0, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v0, \filter_v0 - esp.src.q \input_v0, \input_v1, \input_back - esp.ld.128.usar.ip \input_v1, \input_ptr, 16 - j 3f - - 2: # two 8-input-element left - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_y_offset - esp.vmulas.s8.qacc.ld.xp \filter_v0, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - esp.src.q.ld.ip \input_v0, \input_ptr, 16, \input_v1, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v1, \filter_v0 - esp.src.q.ld.ip \input_v1, \input_ptr, 16, \input_v0, \input_back - 3: -.endm - - - -.macro esp32p4_s8_unaligned_depthwise_conv2d_1ws1_last input_v0, input_v1, input_back, input_ptr, filter_v0, filter_ptr, dilation_x_offset_16, filter_w, filter_w_rs1_1, next_hws1, filter_y_offset, temp - blez \filter_w_rs1_1, 5f - esp.lp.setup 0, \filter_w_rs1_1, 4f - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v0, \filter_v0 - esp.src.q.ld.ip \input_v0, \input_ptr, 16, \input_v1, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v1, \filter_v0 - 4: esp.src.q.ld.ip \input_v1, \input_ptr, 16, \input_v0, \input_back - - 5: - andi \temp, \filter_w, 0xfffffffe - beq \filter_w, \temp, 6f - # three 8-input-element left - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vmulas.s8.qacc.ld.ip \filter_v0, \filter_ptr, 16, \input_v0, \filter_v0 - esp.src.q.ld.ip \input_v0, \input_ptr, 16, \input_v1, \input_back - - esp.ld.128.usar.xp \input_back, \input_ptr, \next_hws1 - esp.vmulas.s8.qacc.ld.xp \filter_v0, \filter_ptr, \filter_y_offset, \input_v1, \filter_v0 - esp.src.q \input_v0, \input_v0, \input_back - - esp.vmulas.s8.qacc \input_v0, \filter_v0 - j 7f - - 6: # two 8-input-element left - esp.ld.128.usar.xp \input_back, \input_ptr, \next_hws1 - esp.vmulas.s8.qacc.ld.xp \filter_v0, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - esp.src.q \input_v1, \input_v1, \input_back - - esp.vmulas.s8.qacc \input_v1, \filter_v0 - 7: -.endm - - - -.macro esp32p4_s8_unaligned_depthwise_conv2d_hws1 input_v0, input_v1, input_back, filter_v0, input_ptr, filter_ptr, dilation_x_offset_16, dilation_y_offset_16, next_hws1, filter_h, filter_w, filter_w_rs1_1, args, filter_y_offset, filter_n_offset, temp - lw \filter_h, 52(\args) # filter_height - // lw \filter_w, 56(\args) # filter_width - - addi \temp, \filter_w, -1 - beqz \temp, 10f - // filter_w >= 2 - esp.ld.128.usar.ip \input_v1, \input_ptr, 16 - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_x_offset_16 - esp.vld.128.ip \filter_v0, \filter_ptr, 16 // filter_v0 - esp.src.q \input_v0, \input_v1, \input_back // input_v0 - esp.ld.128.usar.ip \input_v1, \input_ptr, 16 - - addi \filter_h, \filter_h, -1 - beqz \filter_h, 9f - 8: - esp32p4_s8_unaligned_depthwise_conv2d_1ws1 \input_v0, \input_v1, \input_back, \input_ptr, \filter_v0, \filter_ptr, \dilation_x_offset_16, \dilation_y_offset_16, \filter_w, \filter_w_rs1_1, \filter_y_offset, \temp - addi \filter_h, \filter_h, -1 - bgtz \filter_h, 8b - 9: // last y - esp32p4_s8_unaligned_depthwise_conv2d_1ws1_last \input_v0, \input_v1, \input_back, \input_ptr, \filter_v0, \filter_ptr, \dilation_x_offset_16, \filter_w, \filter_w_rs1_1, \next_hws1, \filter_y_offset, \temp - j 13f - - 10: // filter_w == 1 - esp.ld.128.usar.ip \input_v1, \input_ptr, 16 - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_y_offset_16 - esp.vld.128.xp \filter_v0, \filter_ptr, \filter_y_offset // filter_v0 - esp.src.q \input_v0, \input_v1, \input_back // input_v0 - - addi \filter_h, \filter_h, -1 - beqz \filter_h, 12f - // esp.lp.setup 0, \filter_h, 11f - 11: - esp.ld.128.usar.ip \input_v1, \input_ptr, 16 - esp.ld.128.usar.xp \input_back, \input_ptr, \dilation_y_offset_16 - esp.vmulas.s8.qacc.ld.xp \filter_v0, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - esp.src.q \input_v0, \input_v1, \input_back - addi \filter_h, \filter_h, -1 - bgtz \filter_h, 11b - - 12: // last y - esp.vmulas.s8.qacc \input_v0, \filter_v0 - sub \input_ptr, \input_ptr, \dilation_y_offset_16 - add \input_ptr, \input_ptr, \next_hws1 - - 13: - - add \filter_ptr, \filter_ptr, \filter_n_offset -.endm - - - -.macro esp32p4_s8_unaligned_depthwise_conv2d_11r1_padding input_v0, input_front, input_back, filter_v0, filter_front, filter_back, input_ptr, filter_ptr, c_remainder, forward, filter_y_offset - esp.ld.128.usar.xp \input_v0, \input_ptr, \c_remainder - esp.vld.128.xp \input_back, \input_ptr, \forward - esp.src.q \input_v0, \input_v0, \input_back - - esp.ld.128.usar.xp \filter_v0, \filter_ptr, \c_remainder - esp.vld.128.xp \filter_back, \filter_ptr, \filter_y_offset - esp.src.q \filter_v0, \filter_v0, \filter_back - - esp.vmulas.s8.qacc \input_v0, \filter_v0 -.endm - - - -.macro esp32p4_s8_unaligned_depthwise_conv2d_hwr1 input_v0, input_front, input_back, filter_v0, filter_front, filter_back, input_ptr, filter_ptr, dilation_x_offset_c_remainder, dilation_y_offset_c_remainder, filter_h, filter_w, filter_w_rs1_1, c_remainder, args, filter_y_offset, temp - lw \filter_h, 52(\args) // filter_height - // lw \filter_w, 56(\args) // filter_width - - addi \temp, \filter_w, -1 - beqz \temp, 19f - 18: - beqz \filter_w_rs1_1, 15f - esp.lp.setup 0, \filter_w_rs1_1, 14f - esp32p4_s8_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_c_remainder - esp32p4_s8_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_c_remainder - 14: nop - 15: - andi \temp, \filter_w, 0xfffffffe - beq \temp, \filter_w, 16f - # 3 left - esp32p4_s8_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_c_remainder - esp32p4_s8_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_c_remainder - esp32p4_s8_unaligned_depthwise_conv2d_11r1_padding \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_y_offset_c_remainder, \filter_y_offset - j 17f - 16: - # 2 left - esp32p4_s8_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_c_remainder - esp32p4_s8_unaligned_depthwise_conv2d_11r1_padding \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_y_offset_c_remainder, \filter_y_offset - 17: - addi \filter_h, \filter_h, -1 - bgtz \filter_h, 18b - - j 21f - - 19: - # filter_w == 1 - beqz \filter_h, 21f - esp.lp.setup 0, \filter_h, 20f - esp32p4_s8_unaligned_depthwise_conv2d_11r1_padding \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_y_offset_c_remainder, \filter_y_offset - 20: nop - 21: -.endm - - - -.macro esp32p4_s8_unaligned_depthwise_conv2d_hwc1_load_args args, filter_ptr, dilation_x_offset, dilation_y_offset, next_hwx1, c_div_x_1, mac_shift, filter_w, filter_w_rs1_1 - esp32p4_s8_unaligned_depthwise_conv2d_33c1_load_args \args, \filter_ptr, \dilation_x_offset, \dilation_y_offset, \next_hwx1, \c_div_x_1, \mac_shift - lw \filter_w, 56(\args) - lw \filter_w_rs1_1, 148(\args) -.endm - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias - .type dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias: - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr / unaligned_filter_ptr / tmp value - # a4: input dilation x offset / tmp value - # a5: input dilation y offset / tmp value - # t3: next_hwx1 / c_remainder - # t4: filter_y_offset / unaligned_filter_y_offset / tmp value - # t5: mac_shift - # t6: output_sar_byte / tmp value - - # a6(not for extension instructions): filter_n_offset - # a7(not for extension instructions): c_div_x_1 - # t0(not for extension instructions): filter_h / tmp value - # t1(not for extension instructions): filter_w - # t2(not for extension instructions): filter_w_rs1_1 - # s2(not for extension instructions): tmp value = 8 - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: bias_ptr - # s9: - # s10: - # s11: - - esp32p4_push_12_stacks_3r s2, s0, s8 - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_load_args a2, a3, a4, a5, t3, a7, t5, t1, t2 - addi a4, a4, -16 // a4: dilation_x_offset - 16 - addi a5, a5, -16 // a5: dilation_y_offset - 16 - lw a6, 144(a2) // a6: filter_n_offset - lw s8, 68(a2) // bias - - - bltz a7, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_c_remainder - esp.ld.128.usar.ip q0, a0, 0 - addi t3, t3, -16 // t3: next_hws1 - 16 - lw t4, 60(a2) // t4: filter_y_offset - esp.movx.r.sar.bytes t6 // t6: output_sar_byte - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_c_div_x: - beqz t6, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_128b - li s2, 8 - beq t6, s2, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_64b - # esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_32b: - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_32b_multiple_loop: - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_hws1 q0, q1, q2, q3, a1, a3, a4, a5, t3, t0, t1, t2, a2, t4, a6, t6 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t6 - - addi a7, a7, -1 - bgez a7, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_32b_multiple_loop - j esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_64b: - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_64b_multiple_loop: - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_hws1 q0, q1, q2, q3, a1, a3, a4, a5, t3, t0, t1, t2, a2, t4, a6, t6 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi a7, a7, -1 - bgez a7, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_64b_multiple_loop - j esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_128b: - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_128b_multiple_loop: - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_hws1 q0, q1, q2, q3, a1, a3, a4, a5, t3, t0, t1, t2, a2, t4, a6, t6 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi a7, a7, -1 - bgez a7, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_128b_multiple_loop - - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_c_remainder: - lw t3, 136(a2) // t3: c_remainder - beqz t3, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_c_remainder_end - lw a3, 168(a2) // a3: unaligned_filter_ptr - lw t4, 160(a2) // t4: unaligned_filter_y_offset - addi a4, a4, 16 - addi a5, a5, 16 - sub a4, a4, t3 // a4: dilation_x_offset - c_remainder - sub a5, a5, t3 // a5: dilation_y_offset - c_remainder - - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_hwr1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t0, t1, t2, t3, a2, t4, t6 - esp32p4_s8_128b_vector_shift_result q0, t5 - dl_esp32p4_s8_store_remainder q0, a3, a4, a5, t4, t0, a0, t3 - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_c_remainder_end: - - esp32p4_pop_12_stacks_3r s2, s0, s8 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu - .type dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu: - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr / unaligned_filter_ptr / tmp value - # a4: input dilation x offset / tmp value - # a5: input dilation y offset / tmp value - # t3: next_hwx1 / c_remainder - # t4: filter_y_offset / unaligned_filter_y_offset / tmp value - # t5: mac_shift - # t6: output_sar_byte / tmp value - - # a6(not for extension instructions): filter_n_offset - # a7(not for extension instructions): c_div_x_1 - # t0(not for extension instructions): filter_h / tmp value - # t1(not for extension instructions): filter_w - # t2(not for extension instructions): filter_w_rs1_1 - # s2(not for extension instructions): tmp value = 8 - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: activation_alpha/_address - # s1: activation_shift - # s8: bias_ptr - # s9: - # s10: - # s11: - - esp32p4_push_20_stacks_5r s2, s0, s1, s8, s9 - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_load_args a2, a3, a4, a5, t3, a7, t5, t1, t2 - addi a4, a4, -16 // a4: dilation_x_offset - 16 - addi a5, a5, -16 // a5: dilation_y_offset - 16 - lw a6, 144(a2) // a6: filter_n_offset - lw s0, 76(a2) // activation_alpha - lw s1, 84(a2) // activation_shift - lw s8, 68(a2) // bias - - - bltz a7, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_c_remainder - esp.ld.128.usar.ip q0, a0, 0 - addi t3, t3, -16 // t3: next_hws1 - 16 - lw t4, 60(a2) // t4: filter_y_offset - esp.movx.r.sar.bytes t6 // t6: output_sar_byte - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_c_div_x: - beqz t6, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_128b - li s2, 8 - beq t6, s2, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_64b - # esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_32b: - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_32b_multiple_loop: - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_hws1 q0, q1, q2, q3, a1, a3, a4, a5, t3, t0, t1, t2, a2, t4, a6, t6 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp32p4_s8_128b_vector_relu q0, s0, s1 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t6 - - addi a7, a7, -1 - bgez a7, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_32b_multiple_loop - j esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_64b: - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_64b_multiple_loop: - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_hws1 q0, q1, q2, q3, a1, a3, a4, a5, t3, t0, t1, t2, a2, t4, a6, t6 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp32p4_s8_128b_vector_relu q0, s0, s1 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi a7, a7, -1 - bgez a7, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_64b_multiple_loop - j esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_128b: - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_128b_multiple_loop: - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_hws1 q0, q1, q2, q3, a1, a3, a4, a5, t3, t0, t1, t2, a2, t4, a6, t6 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp32p4_s8_128b_vector_relu q0, s0, s1 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi a7, a7, -1 - bgez a7, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_128b_multiple_loop - - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_c_remainder: - lw t3, 136(a2) // t3: c_remainder - beqz t3, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_c_remainder_end - lw a3, 168(a2) // a3: unaligned_filter_ptr - lw t4, 160(a2) // t4: unaligned_filter_y_offset - addi a4, a4, 16 - addi a5, a5, 16 - sub a4, a4, t3 // a4: dilation_x_offset - c_remainder - sub a5, a5, t3 // a5: dilation_y_offset - c_remainder - - esp.zero.qacc - esp32p4_s8_conv2d_128b_vector_bias s8 - esp32p4_s8_unaligned_depthwise_conv2d_hwr1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t0, t1, t2, t3, a2, t4, t6 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp32p4_s8_128b_vector_relu q0, s0, s1 - dl_esp32p4_s8_store_remainder q0, a3, a4, a5, t4, t0, a0, t3 - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_bias_relu_c_remainder_end: - - esp32p4_pop_20_stacks_5r s2, s0, s1, s8, s9 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1 - .type dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1: - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr / unaligned_filter_ptr / tmp value - # a4: input dilation x offset / tmp value - # a5: input dilation y offset / tmp value - # t3: next_hwx1 / c_remainder - # t4: filter_y_offset / unaligned_filter_y_offset / tmp value - # t5: mac_shift - # t6: output_sar_byte / tmp value - - # a6(not for extension instructions): filter_n_offset - # a7(not for extension instructions): c_div_x_1 - # t0(not for extension instructions): filter_h / tmp value - # t1(not for extension instructions): filter_w - # t2(not for extension instructions): filter_w_rs1_1 - # s2(not for extension instructions): tmp value = 8 - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp32p4_push_4_stacks_1r s2 - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_load_args a2, a3, a4, a5, t3, a7, t5, t1, t2 - addi a4, a4, -16 // a4: dilation_x_offset - 16 - addi a5, a5, -16 // a5: dilation_y_offset - 16 - lw a6, 144(a2) // a6: filter_n_offset - - - bltz a7, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_c_remainder - esp.ld.128.usar.ip q0, a0, 0 - addi t3, t3, -16 // t3: next_hws1 - 16 - lw t4, 60(a2) // t4: filter_y_offset - esp.movx.r.sar.bytes t6 // t6: output_sar_byte - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_c_div_x: - beqz t6, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_128b - li s2, 8 - beq t6, s2, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_64b - # esp32p4_s8_unaligned_depthwise_conv2d_hwc1_32b: - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_32b_multiple_loop: - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_hws1 q0, q1, q2, q3, a1, a3, a4, a5, t3, t0, t1, t2, a2, t4, a6, t6 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t6 - - addi a7, a7, -1 - bgez a7, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_32b_multiple_loop - j esp32p4_s8_unaligned_depthwise_conv2d_hwc1_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_64b: - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_64b_multiple_loop: - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_hws1 q0, q1, q2, q3, a1, a3, a4, a5, t3, t0, t1, t2, a2, t4, a6, t6 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi a7, a7, -1 - bgez a7, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_64b_multiple_loop - j esp32p4_s8_unaligned_depthwise_conv2d_hwc1_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_128b: - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_128b_multiple_loop: - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_hws1 q0, q1, q2, q3, a1, a3, a4, a5, t3, t0, t1, t2, a2, t4, a6, t6 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi a7, a7, -1 - bgez a7, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_128b_multiple_loop - - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_c_remainder: - lw t3, 136(a2) // t3: c_remainder - beqz t3, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_c_remainder_end - lw a3, 168(a2) // a3: unaligned_filter_ptr - lw t4, 160(a2) // t4: unaligned_filter_y_offset - addi a4, a4, 16 - addi a5, a5, 16 - sub a4, a4, t3 // a4: dilation_x_offset - c_remainder - sub a5, a5, t3 // a5: dilation_y_offset - c_remainder - - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_hwr1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t0, t1, t2, t3, a2, t4, t6 - esp32p4_s8_128b_vector_shift_result q0, t5 - dl_esp32p4_s8_store_remainder q0, a3, a4, a5, t4, t0, a0, t3 - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_c_remainder_end: - - esp32p4_pop_4_stacks_1r s2 - ret - - - - .text - .align 2 - .global dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu - .type dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu, @function - .balign 4 - .option norvc -dl_esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu: - # a0: int8_t *output_ptr - # a1: int8_t *input_ptr - # a2: void *args - - # a3: int8_t *filter_ptr / unaligned_filter_ptr / tmp value - # a4: input dilation x offset / tmp value - # a5: input dilation y offset / tmp value - # t3: next_hwx1 / c_remainder - # t4: filter_y_offset / unaligned_filter_y_offset / tmp value - # t5: mac_shift - # t6: output_sar_byte / tmp value - - # a6(not for extension instructions): filter_n_offset - # a7(not for extension instructions): c_div_x_1 - # t0(not for extension instructions): filter_h / tmp value - # t1(not for extension instructions): filter_w - # t2(not for extension instructions): filter_w_rs1_1 - # s2(not for extension instructions): tmp value = 8 - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: activation_alpha/_address - # s1: activation_shift - # s8: - # s9: - # s10: - # s11: - - esp32p4_push_12_stacks_3r s2, s0, s1 - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_load_args a2, a3, a4, a5, t3, a7, t5, t1, t2 - addi a4, a4, -16 // a4: dilation_x_offset - 16 - addi a5, a5, -16 // a5: dilation_y_offset - 16 - lw a6, 144(a2) // a6: filter_n_offset - lw s0, 76(a2) // activation_alpha - lw s1, 84(a2) // activation_shift - - - bltz a7, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_c_remainder - esp.ld.128.usar.ip q0, a0, 0 - addi t3, t3, -16 // t3: next_hws1 - 16 - lw t4, 60(a2) // t4: filter_y_offset - esp.movx.r.sar.bytes t6 // t6: output_sar_byte - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_c_div_x: - beqz t6, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_128b - li s2, 8 - beq t6, s2, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_64b - # esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_32b: - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_32b_multiple_loop: - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_hws1 q0, q1, q2, q3, a1, a3, a4, a5, t3, t0, t1, t2, a2, t4, a6, t6 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp32p4_s8_128b_vector_relu q0, s0, s1 - esp32p4_s8_32b_unaligned_vector_store q0, a0, t6 - - addi a7, a7, -1 - bgez a7, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_32b_multiple_loop - j esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_64b: - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_64b_multiple_loop: - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_hws1 q0, q1, q2, q3, a1, a3, a4, a5, t3, t0, t1, t2, a2, t4, a6, t6 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp32p4_s8_128b_vector_relu q0, s0, s1 - esp32p4_s8_64b_unaligned_vector_store q0, a0 - - addi a7, a7, -1 - bgez a7, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_64b_multiple_loop - j esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_c_remainder - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_128b: - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_128b_multiple_loop: - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_hws1 q0, q1, q2, q3, a1, a3, a4, a5, t3, t0, t1, t2, a2, t4, a6, t6 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp32p4_s8_128b_vector_relu q0, s0, s1 - esp32p4_s8_128b_aligned_vector_store q0, a0 - - addi a7, a7, -1 - bgez a7, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_128b_multiple_loop - - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_c_remainder: - lw t3, 136(a2) // t3: c_remainder - beqz t3, esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_c_remainder_end - lw a3, 168(a2) // a3: unaligned_filter_ptr - lw t4, 160(a2) // t4: unaligned_filter_y_offset - addi a4, a4, 16 - addi a5, a5, 16 - sub a4, a4, t3 // a4: dilation_x_offset - c_remainder - sub a5, a5, t3 // a5: dilation_y_offset - c_remainder - - esp.zero.qacc - esp32p4_s8_unaligned_depthwise_conv2d_hwr1 q0, q1, q2, q3, q4, q5, a1, a3, a4, a5, t0, t1, t2, t3, a2, t4, t6 - esp32p4_s8_128b_vector_shift_result q0, t5 - esp32p4_s8_128b_vector_relu q0, s0, s1 - dl_esp32p4_s8_store_remainder q0, a3, a4, a5, t4, t0, a0, t3 - - esp32p4_s8_unaligned_depthwise_conv2d_hwc1_relu_c_remainder_end: - - esp32p4_pop_12_stacks_3r s2, s0, s1 - ret \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16.S deleted file mode 100644 index e6ddd226..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16.S +++ /dev/null @@ -1,128 +0,0 @@ -.macro dl_tie728_128b_unaligned_store0 output_v, output_ptr, tmp32 - EE.MOVI.32.A \output_v, \tmp32, 0 - s32i \tmp32, \output_ptr, 0 - EE.MOVI.32.A \output_v, \tmp32, 1 - s32i \tmp32, \output_ptr, 4 - EE.MOVI.32.A \output_v, \tmp32, 2 - s32i \tmp32, \output_ptr, 8 - EE.MOVI.32.A \output_v, \tmp32, 3 - s32i \tmp32, \output_ptr, 12 - addi \output_ptr, \output_ptr, 16 -.endm - -.macro dl_tie728_128b_unaligned_store1 output_v, output_ptr - EE.VST.L.64.IP \output_v, \output_ptr, 8 - EE.VST.H.64.IP \output_v, \output_ptr, 8 -.endm - -.macro dl_tie728_128b_last_store_data tmp_q, output_v, tmp_a, c_remainder_bytes - beqi \c_remainder_bytes, 0, 600f - movi \tmp_a, 15 - sub \tmp_a, \tmp_a, \c_remainder_bytes - movi \c_remainder_bytes, 0 - EE.SLCXXP.2Q \tmp_q, \output_v, \tmp_a, \c_remainder_bytes #left shift to make the rest part 0 - EE.SRCXXP.2Q \output_v, \tmp_q, \tmp_a, \c_remainder_bytes #right shift to lower bits -600: -.endm - - -.macro dl_tie728_s16_store_aligned_remainder remainder_data, c_remainder, tmp_a, output_ptr -607: # remainder == 1, 0x111 - bbci \c_remainder, 2, 603f - bbci \c_remainder, 1, 605f - bbci \c_remainder, 0, 606f - EE.VST.L.64.IP \remainder_data, \output_ptr, 0 - EE.MOVI.32.A \remainder_data, \tmp_a, 2 - s32i \tmp_a, \output_ptr, 8 - EE.MOVI.32.A \remainder_data, \tmp_a, 3 - s16i \tmp_a, \output_ptr, 12 - j 600f -606: # remainder == 1, 0x110 - EE.VST.L.64.IP \remainder_data, \output_ptr, 0 - EE.MOVI.32.A \remainder_data, \tmp_a, 2 - s32i \tmp_a, \output_ptr, 8 - j 600f -605: # remainder == 1, 0x101 - bbci \c_remainder, 0, 604f - EE.VST.L.64.IP \remainder_data, \output_ptr, 0 - EE.MOVI.32.A \remainder_data, \tmp_a, 2 - s16i \tmp_a, \output_ptr, 8 - j 600f -604: # remainder == 1, 0x100 - EE.VST.L.64.IP \remainder_data, \output_ptr, 0 - j 600f -603: # remainder == 1, 0x011 - bbci \c_remainder, 1, 601f - bbci \c_remainder, 0, 602f - EE.MOVI.32.A \remainder_data, \tmp_a, 0 - s32i \tmp_a, \output_ptr, 0 - EE.MOVI.32.A \remainder_data, \tmp_a, 1 - s16i \tmp_a, \output_ptr, 4 - j 600f -602: # remainder == 1, 0x010 - EE.MOVI.32.A \remainder_data, \tmp_a, 0 - s32i \tmp_a, \output_ptr, 0 - j 600f -601: # remainder == 1, 0x001 - bbci \c_remainder, 0, 600f - EE.MOVI.32.A \remainder_data, \tmp_a, 0 - s16i \tmp_a, \output_ptr, 0 -600: -.endm - - -.macro dl_tie728_s16_store_remainder remainder_data, c_remainder, tmp_a, output_ptr -607: # remainder == 1, 0x111 - bbci \c_remainder, 2, 603f - bbci \c_remainder, 1, 605f - bbci \c_remainder, 0, 606f - EE.MOVI.32.A \remainder_data, \tmp_a, 0 - s32i \tmp_a, \output_ptr, 0 - EE.MOVI.32.A \remainder_data, \tmp_a, 1 - s32i \tmp_a, \output_ptr, 4 - EE.MOVI.32.A \remainder_data, \tmp_a, 2 - s32i \tmp_a, \output_ptr, 8 - EE.MOVI.32.A \remainder_data, \tmp_a, 3 - s16i \tmp_a, \output_ptr, 12 - j 600f -606: # remainder == 1, 0x110 - EE.MOVI.32.A \remainder_data, \tmp_a, 0 - s32i \tmp_a, \output_ptr, 0 - EE.MOVI.32.A \remainder_data, \tmp_a, 1 - s32i \tmp_a, \output_ptr, 4 - EE.MOVI.32.A \remainder_data, \tmp_a, 2 - s32i \tmp_a, \output_ptr, 8 - j 600f -605: # remainder == 1, 0x101 - bbci \c_remainder, 0, 604f - EE.MOVI.32.A \remainder_data, \tmp_a, 0 - s32i \tmp_a, \output_ptr, 0 - EE.MOVI.32.A \remainder_data, \tmp_a, 1 - s32i \tmp_a, \output_ptr, 4 - EE.MOVI.32.A \remainder_data, \tmp_a, 2 - s16i \tmp_a, \output_ptr, 8 - j 600f -604: # remainder == 1, 0x100 - EE.MOVI.32.A \remainder_data, \tmp_a, 0 - s32i \tmp_a, \output_ptr, 0 - EE.MOVI.32.A \remainder_data, \tmp_a, 1 - s32i \tmp_a, \output_ptr, 4 - j 600f -603: # remainder == 1, 0x011 - bbci \c_remainder, 1, 601f - bbci \c_remainder, 0, 602f - EE.MOVI.32.A \remainder_data, \tmp_a, 0 - s32i \tmp_a, \output_ptr, 0 - EE.MOVI.32.A \remainder_data, \tmp_a, 1 - s16i \tmp_a, \output_ptr, 4 - j 600f -602: # remainder == 1, 0x010 - EE.MOVI.32.A \remainder_data, \tmp_a, 0 - s32i \tmp_a, \output_ptr, 0 - j 600f -601: # remainder == 1, 0x001 - bbci \c_remainder, 0, 600f - EE.MOVI.32.A \remainder_data, \tmp_a, 0 - s16i \tmp_a, \output_ptr, 0 -600: -.endm \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_add2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_add2d.S deleted file mode 100644 index 34176cbd..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_add2d.S +++ /dev/null @@ -1,1441 +0,0 @@ -# include "dl_tie728_s16.S" - -############################################################################################################################################################ -#### -#### tie728_s16_add2d_11c series -#### -############################################################################################################################################################ - -.macro dl_tie728_s16_rescale_add_rescale_output input0, input1, output, output_scale, output_shift - EE.ZERO.QACC - EE.VMULAS.s16.QACC \input0, \output_scale - EE.VMULAS.s16.QACC \input1, \output_scale - EE.SRCMB.S16.QACC \output, \output_shift, 0 -.endm - - - .align 4 - .text - .global dl_tie728_s16_add2d_11c - .type dl_tie728_s16_add2d_11c, @function - # .section .iram1 -dl_tie728_s16_add2d_11c: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_2x_1 - # a7: c_left_x_1 - - l32i a6, a5, 68 - l32i a7, a5, 72 - - blti a6, 1, dl_tie728_s16_add2d_small_channel - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - loopgtz a6, 0f - EE.VLD.128.IP q2, a3, 16 - EE.VADDS.S16.LD.INCP q3, a4, q4, q0, q1 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VADDS.S16.LD.INCP q1, a4, q5, q2, q3 - EE.VST.128.IP q5, a2, 16 -0: - EE.VLD.128.IP q2, a3, 16 - EE.VADDS.S16.LD.INCP q3, a4, q4, q0, q1 - EE.VST.128.IP q4, a2, 16 - beqi a7, 1, 2f #remainder == 2*16byte - beqi a7, 2, 3f #remainder == 3*16byte - -2: - EE.VADDS.S16 q5, q2, q3 - EE.VST.128.IP q5, a2, 16 - retw -3: - EE.VLD.128.IP q0, a3, 16 - EE.VADDS.S16.LD.INCP q1, a4, q5, q2, q3 - EE.VST.128.IP q5, a2, 16 - - EE.VADDS.S16 q4, q0, q1 - EE.VST.128.IP q4, a2, 16 - retw - -dl_tie728_s16_add2d_small_channel: - blti a7, 0, 5f - loopgtz a7, 1f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - EE.VADDS.S16 q2, q0, q1 - EE.VST.128.IP q2, a2, 16 -1: - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - EE.VADDS.S16 q2, q0, q1 - EE.VST.128.IP q2, a2, 16 -5: - retw - - - - .align 4 - .text - .global dl_tie728_s16_rescale_add2d_11c - .type dl_tie728_s16_rescale_add2d_11c, @function - # .section .iram1 -dl_tie728_s16_rescale_add2d_11c: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - - l32i a6, a5, 64 - l32i a7, a5, 88 - l32i a8, a5, 96 - l32i a9, a5, 92 - - beqi a8, 1, dl_tie728_s16_rescale_add2d_output - -dl_tie728_s16_rescale_add2d_output_scale: - s16i a8, a1, 0 - EE.VLDBC.16 q7, a1 # all output_scale - - # addi a6, a6, 1 - loopgtz a6, 1f - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_add_rescale_output q0, q1, q1, q7, a9 - - EE.VST.128.IP q1, a2, 16 - -1: - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S16.QACC q1, a7, 0 - dl_tie728_s16_rescale_add_rescale_output q0, q1, q1, q7, a9 - - EE.VST.128.IP q1, a2, 16 - retw - -dl_tie728_s16_rescale_add2d_output: - movi a13, 1 - s16i a13, a1, 0 - EE.VLDBC.16 q7, a1 # all 1 - - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - - loopgtz a6, 2f - EE.SRCMB.S16.QACC q1, a7, 0 - EE.VMULAS.S16.QACC.LD.IP q0, a3, 16, q0, q7 - EE.SRCMB.S16.QACC q1, a9, 0 - EE.LDQA.S16.128.IP a4, 16 - EE.VST.128.IP q1, a2, 16 -2: - - EE.SRCMB.S16.QACC q1, a7, 0 - EE.VMULAS.S16.QACC q0, q7 - EE.SRCMB.S16.QACC q1, a9, 0 - - EE.VST.128.IP q1, a2, 16 - retw - - - .align 4 - .text - .global dl_tie728_s16_add2d_11c_relu - .type dl_tie728_s16_add2d_11c_relu, @function - # .section .iram1 -dl_tie728_s16_add2d_11c_relu: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_2x_1 - # a7: c_left_x - # a14: activation_alpha - # a15: activation_shift - - l32i a6, a5, 68 - l32i a7, a5, 72 - l32i a14, a5, 52 - l32i a15, a5, 60 - - - beqz a6, dl_tie728_s16_add2d_relu_small_channel - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - loopgtz a6, 0f - EE.VLD.128.IP q2, a3, 16 - EE.VADDS.S16.LD.INCP q3, a4, q4, q0, q1 - EE.VRELU.S16 q4, a14, a15 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VADDS.S16.LD.INCP q1, a4, q5, q2, q3 - EE.VRELU.S16 q5, a14, a15 - EE.VST.128.IP q5, a2, 16 -0: - EE.VLD.128.IP q2, a3, 16 - EE.VADDS.S16.LD.INCP q3, a4, q4, q0, q1 - EE.VRELU.S16 q4, a14, a15 - EE.VST.128.IP q4, a2, 16 - beqi a7, 1, 2f #remainder == 2*16byte - beqi a7, 2, 3f #remainder == 3*16byte -2: - EE.VADDS.S16 q5, q2, q3 - EE.VRELU.S16 q5, a14, a15 - EE.VST.128.IP q5, a2, 16 - retw -3: - EE.VLD.128.IP q0, a3, 16 - EE.VADDS.S16.LD.INCP q1, a4, q5, q2, q3 - EE.VRELU.S16 q5, a14, a15 - EE.VST.128.IP q5, a2, 16 - - EE.VADDS.S16 q4, q0, q1 - EE.VRELU.S16 q4, a14, a15 - EE.VST.128.IP q4, a2, 16 - retw - -dl_tie728_s16_add2d_relu_small_channel: - blti a7, 0, 5f - loopgtz a7, 1f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - EE.VADDS.S16 q2, q0, q1 - EE.VRELU.S16 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 -1: - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - EE.VADDS.S16 q2, q0, q1 - EE.VRELU.S16 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 -5: - retw - - - - .align 4 - .text - .global dl_tie728_s16_rescale_add2d_11c_relu - .type dl_tie728_s16_rescale_add2d_11c_relu, @function - # .section .iram1 -dl_tie728_s16_rescale_add2d_11c_relu: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a14: activation_alpha - # a15: activation_shift - - l32i a6, a5, 64 - l32i a7, a5, 88 - l32i a8, a5, 96 - l32i a9, a5, 92 - l32i a14, a5, 52 - l32i a15, a5, 60 - - beqi a8, 1, dl_tie728_s16_rescale_add2d_output_relu - -dl_tie728_s16_rescale_add2d_output_scale_relu: - s16i a8, a1, 0 - EE.VLDBC.16 q7, a1 # all output_scale - - loopgtz a6, 1f - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_add_rescale_output q0, q1, q1, q7, a9 - - EE.VRELU.S16 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - -1: - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_add_rescale_output q0, q1, q1, q7, a9 - - EE.VRELU.S16 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - retw - -dl_tie728_s16_rescale_add2d_output_relu: - movi a13, 1 - s16i a13, a1, 0 - EE.VLDBC.16 q7, a1 # all 1 - - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - - loopgtz a6, 2f - EE.SRCMB.S16.QACC q1, a7, 0 - EE.VMULAS.S16.QACC.LD.IP q0, a3, 16, q0, q7 - EE.SRCMB.S16.QACC q1, a9, 0 - EE.LDQA.S16.128.IP a4, 16 - EE.VRELU.S16 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 -2: - - EE.SRCMB.S16.QACC q1, a7, 0 - EE.VMULAS.S16.QACC q0, q7 - EE.SRCMB.S16.QACC q1, a9, 0 - EE.VRELU.S16 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - retw - - - - .align 4 - .text - .global dl_tie728_s16_add2d_11c_prelu - .type dl_tie728_s16_add2d_11c_prelu, @function - # .section .iram1 -dl_tie728_s16_add2d_11c_prelu: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_2x_1 - # a7: c_left_x - # a14: activation_alpha_ptr - # a15: activation_shift - - l32i a6, a5, 68 - l32i a7, a5, 72 - l32i a14, a5, 56 - l32i a15, a5, 60 - - beqz a6, dl_tie728_s16_add2d_prelu_small_channel - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - loopgtz a6, 0f - EE.VLD.128.IP q2, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S16.LD.INCP q3, a4, q4, q0, q1 - EE.VPRELU.S16 q4, q4, q6, a15 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S16.LD.INCP q1, a4, q5, q2, q3 - EE.VPRELU.S16 q5, q5, q6, a15 - EE.VST.128.IP q5, a2, 16 -0: - EE.VLD.128.IP q2, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S16.LD.INCP q3, a4, q4, q0, q1 - EE.VPRELU.S16 q4, q4, q6, a15 - EE.VST.128.IP q4, a2, 16 - beqi a7, 1, 2f #remainder == 2*16byte - beqi a7, 2, 3f #remainder == 3*16byte - -2: - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S16 q5, q2, q3 - EE.VPRELU.S16 q5, q5, q6, a15 - EE.VST.128.IP q5, a2, 16 - retw -3: - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S16.LD.INCP q1, a4, q5, q2, q3 - EE.VPRELU.S16 q5, q5, q6, a15 - EE.VST.128.IP q5, a2, 16 - - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S16 q4, q0, q1 - EE.VPRELU.S16 q4, q4, q6, a15 - EE.VST.128.IP q4, a2, 16 - retw - -dl_tie728_s16_add2d_prelu_small_channel: - blti a7, 0, 5f - loopgtz a7, 1f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S16 q2, q0, q1 - EE.VPRELU.S16 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 -1: - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S16 q2, q0, q1 - EE.VPRELU.S16 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 -5: - retw - - - - .align 4 - .text - .global dl_tie728_s16_rescale_add2d_11c_prelu - .type dl_tie728_s16_rescale_add2d_11c_prelu, @function - # .section .iram1 -dl_tie728_s16_rescale_add2d_11c_prelu: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a14: activation_alpha_ptr - # a15: activation_shift - - l32i a6, a5, 64 - l32i a7, a5, 88 - l32i a8, a5, 96 - l32i a9, a5, 92 - l32i a14, a5, 56 - l32i a15, a5, 60 - - beqi a8, 1, dl_tie728_s16_rescale_add2d_output_prelu - -dl_tie728_s16_rescale_add2d_output_scale_prelu: - s16i a8, a1, 0 - EE.VLDBC.16 q7, a1 # all output_scale - - # addi a6, a6, 1 - loopgtz a6, 1f - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_add_rescale_output q0, q1, q1, q7, a9 - - EE.VLD.128.IP q5, a14, 16 - - EE.VPRELU.S16 q1, q1, q5, a15 - EE.VST.128.IP q1, a2, 16 - -1: - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_add_rescale_output q0, q1, q1, q7, a9 - - EE.VLD.128.IP q5, a14, 16 - - EE.VPRELU.S16 q1, q1, q5, a15 - EE.VST.128.IP q1, a2, 16 - retw - -dl_tie728_s16_rescale_add2d_output_prelu: - movi a13, 1 - s16i a13, a1, 0 - EE.VLDBC.16 q7, a1 # all 1 - - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - - loopgtz a6, 2f - EE.SRCMB.S16.QACC q1, a7, 0 - EE.VMULAS.S16.QACC.LD.IP q0, a3, 16, q0, q7 - EE.SRCMB.S16.QACC q1, a9, 0 - - EE.VLD.128.IP q6, a14, 16 - EE.LDQA.S16.128.IP a4, 16 - EE.VPRELU.S16 q1, q1, q6, a15 - EE.VST.128.IP q1, a2, 16 -2: - - EE.SRCMB.S16.QACC q1, a7, 0 - EE.VMULAS.S16.QACC q0, q7 - - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S16.QACC q1, a9, 0 - EE.VPRELU.S16 q1, q1, q6, a15 - EE.VST.128.IP q1, a2, 16 - retw - - -############################################################################################################################################################ -#### -#### tie728_s16_unaligned_add2d_11c series -#### -############################################################################################################################################################ - - .align 4 - .text - .global dl_tie728_s16_unaligned_add2d_11c - .type dl_tie728_s16_unaligned_add2d_11c, @function - # .section .iram1 -dl_tie728_s16_unaligned_add2d_11c: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a10: c_remainder - - l32i a6, a5, 64 - l32i a10, a5, 76 - l32i a7, a5, 88 - - bgei a7, 0, dl_tie728_s16_unaligned_rescale_add2d_11c - -# input0 exp = input1 exp = output exp - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie728_s16_unaligned_add2d_11c_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie728_s16_unaligned_add2d_11c_0 - beqi a13, 8, dl_tie728_s16_unaligned_add2d_11c_1 - - loopgtz a6, 0f #dl_tie728_s16_unaligned_add2d_11c - EE.SRC.Q.QUP q2, q0, q1 - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.VADDS.S16 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_128b_unaligned_store0 q2, a2, a13 -0: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S16 q2, q2, q5 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie728_s16_unaligned_add2d_11c_remainder - - #output sar = 0 -dl_tie728_s16_unaligned_add2d_11c_0: - loopgtz a6, 1f #dl_tie728_s16_unaligned_add2d_11c_loop0 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S16 q2, q2, q5 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VST.128.IP q2, a2, 16 -1: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S16 q2, q2, q5 - EE.VST.128.IP q2, a2, 16 - j dl_tie728_s16_unaligned_add2d_11c_remainder - - # #output sar = 8 -dl_tie728_s16_unaligned_add2d_11c_1: - loopgtz a6, 2f #dl_tie728_s16_unaligned_add2d_11c_loop1 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S16 q2, q2, q5 - - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_128b_unaligned_store1 q2, a2 -2: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S16 q2, q2, q5 - dl_tie728_128b_unaligned_store1 q2, a2 - j dl_tie728_s16_unaligned_add2d_11c_remainder - -dl_tie728_s16_unaligned_add2d_11c_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 - -dl_tie728_s16_unaligned_add2d_11c_remainder: - - beqz a10, dl_tie728_s16_unaligned_add2d_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VADDS.S16 q2, q2, q5 - - srli a10, a10, 1 - dl_tie728_s16_store_remainder q2, a10, a13, a2 - -dl_tie728_s16_unaligned_add2d_end: - - retw - - -## rescaled add -dl_tie728_s16_unaligned_rescale_add2d_11c: - l32i a8, a5, 96 # output_scale - l32i a9, a5, 92 # output_shift - - beqi a8, 1, dl_tie728_s16_rescale_unaligned_add2d_output_shift - - -### rescaled to output by *scale) >> shift -dl_tie728_s16_rescale_unaligned_add2d_output_scale: - - s32i a8, a1, 0 - EE.VLDBC.16 q7, a1 # all output_scale - - blti a6, 0, dl_tie728_s16_rescale_unaligned_add2d_scale_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 3f #dl_tie728_s16_rescale_unaligned_add2d_11c_scale - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_add_rescale_output q2, q1, q2, q7, a9 - - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_128b_unaligned_store0 q2, a2, a12 - 3: - - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_add_rescale_output q2, q1, q2, q7, a9 - - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie728_s16_rescale_unaligned_add2d_scale_remainder - - -dl_tie728_s16_rescale_unaligned_add2d_scale_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 #input1 sar - -dl_tie728_s16_rescale_unaligned_add2d_scale_remainder: - beqz a10, dl_tie728_s16_unaligned_rescale_add2d_output_scale_end # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_add_rescale_output q2, q1, q2, q7, a9 - - srli a10, a10, 1 - dl_tie728_s16_store_remainder q2, a10, a12, a2 - -dl_tie728_s16_unaligned_rescale_add2d_output_scale_end: - retw - - -### rescaled to output by right shift -dl_tie728_s16_rescale_unaligned_add2d_output_shift: - movi a13, 1 - s16i a13, a1, 0 - EE.VLDBC.16 q7, a1 # all 1 - - blti a6, 0, dl_tie728_s16_rescale_unaligned_add2d_shift_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 4f #dl_tie728_s16_rescale_unaligned_add2d_11c_shift - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S16.QACC q5 - - EE.SRCMB.S16.QACC q5, a7, 0 - EE.VMULAS.S16.QACC q2, q7 - EE.SRCMB.S16.QACC q5, a9, 0 - - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_128b_unaligned_store0 q5, a2, a13 - 4: - addi a3, a3, -16 - add a3, a3, a10 - - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S16.QACC q5 - - EE.SRCMB.S16.QACC q5, a7, 0 - EE.VMULAS.S16.QACC q2, q7 - EE.SRCMB.S16.QACC q5, a9, 0 - - dl_tie728_128b_unaligned_store0 q5, a2, a13 - j dl_tie728_s16_rescale_unaligned_add2d_shift_remainder - - - -dl_tie728_s16_rescale_unaligned_add2d_shift_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 #input1 sar - -dl_tie728_s16_rescale_unaligned_add2d_shift_remainder: - beqz a10, dl_tie728_s16_unaligned_rescale_add2d_output_shift_end # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q5, a7, 0 - EE.VMULAS.S16.QACC q2, q7 - EE.SRCMB.S16.QACC q5, a9, 0 - - srli a10, a10, 1 - dl_tie728_s16_store_remainder q5, a10, a13, a2 - -dl_tie728_s16_unaligned_rescale_add2d_output_shift_end: - retw - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_add2d_11c_relu - .type dl_tie728_s16_unaligned_add2d_11c_relu, @function - # .section .iram1 -dl_tie728_s16_unaligned_add2d_11c_relu: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a10: c_remainder - # a14: activation_alpha - # a15: activation_shift - - l32i a6, a5, 64 - l32i a10, a5, 76 - l32i a7, a5, 88 - l32i a14, a5, 52 - l32i a15, a5, 60 - - bgei a7, 0, dl_tie728_s16_unaligned_rescale_add2d_11c_relu - -# input0 exp = input1 exp = output exp - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie728_s16_unaligned_add2d_11c_small_remainder_relu # channel < 8 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie728_s16_unaligned_add2d_11c_relu_0 - beqi a13, 8, dl_tie728_s16_unaligned_add2d_11c_relu_1 - - - loopgtz a6, 0f #dl_tie728_s16_unaligned_add2d_11c_relu - EE.SRC.Q.QUP q2, q0, q1 - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.VADDS.S16 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 -0: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S16 q2, q2, q5 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie728_s16_unaligned_add2d_11c_remainder_relu - - #output sar = 0 -dl_tie728_s16_unaligned_add2d_11c_relu_0: - loopgtz a6, 1f #dl_tie728_s16_unaligned_add2d_11c_loop0_relu - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S16 q2, q2, q5 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S16 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 -1: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S16 q2, q2, q5 - EE.VRELU.S16 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie728_s16_unaligned_add2d_11c_remainder_relu - - # #output sar = 8 -dl_tie728_s16_unaligned_add2d_11c_relu_1: - loopgtz a6, 2f #dl_tie728_s16_unaligned_add2d_11c_loop1_relu - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S16 q2, q2, q5 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store1 q2, a2 -2: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S16 q2, q2, q5 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store1 q2, a2 - j dl_tie728_s16_unaligned_add2d_11c_remainder_relu - -dl_tie728_s16_unaligned_add2d_11c_small_remainder_relu: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 - -dl_tie728_s16_unaligned_add2d_11c_remainder_relu: - - beqz a10, dl_tie728_s16_unaligned_add2d_end_relu - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VADDS.S16 q2, q2, q5 - EE.VRELU.S16 q2, a14, a15 - srli a10, a10, 1 - dl_tie728_s16_store_remainder q2, a10, a13, a2 - -dl_tie728_s16_unaligned_add2d_end_relu: - - retw - - -## rescaled add -dl_tie728_s16_unaligned_rescale_add2d_11c_relu: - l32i a8, a5, 96 # output_scale - l32i a9, a5, 92 # output_shift - - beqi a8, 1, dl_tie728_s16_rescale_unaligned_add2d_output_shift_relu - - -### rescaled to output by *scale) >> shift -dl_tie728_s16_rescale_unaligned_add2d_output_scale_relu: - - s32i a8, a1, 0 - EE.VLDBC.16 q7, a1 # all output_scale - - blti a6, 0, dl_tie728_s16_rescale_unaligned_add2d_scale_small_remainder_relu # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 3f #dl_tie728_s16_rescale_unaligned_add2d_11c_scale_relu - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_add_rescale_output q2, q1, q2, q7, a9 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a12 - 3: - - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_add_rescale_output q2, q1, q2, q7, a9 - - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie728_s16_rescale_unaligned_add2d_scale_remainder_relu - - -dl_tie728_s16_rescale_unaligned_add2d_scale_small_remainder_relu: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 #input1 sar - -dl_tie728_s16_rescale_unaligned_add2d_scale_remainder_relu: - beqz a10, dl_tie728_s16_unaligned_rescale_add2d_output_scale_end_relu # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_add_rescale_output q2, q1, q2, q7, a9 - - EE.VRELU.S16 q2, a14, a15 - srli a10, a10, 1 - dl_tie728_s16_store_remainder q2, a10, a13, a2 - -dl_tie728_s16_unaligned_rescale_add2d_output_scale_end_relu: - retw - - -### rescaled to output by right shift -dl_tie728_s16_rescale_unaligned_add2d_output_shift_relu: - movi a13, 1 - s16i a13, a1, 0 - EE.VLDBC.16 q7, a1 # all 1 - - blti a6, 0, dl_tie728_s16_rescale_unaligned_add2d_shift_small_remainder_relu # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 4f #dl_tie728_s16_rescale_unaligned_add2d_11c_shift_relu - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S16.QACC q5 - - EE.SRCMB.S16.QACC q5, a7, 0 - EE.VMULAS.S16.QACC q2, q7 - EE.SRCMB.S16.QACC q5, a9, 0 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S16 q5, a14, a15 - dl_tie728_128b_unaligned_store0 q5, a2, a13 - 4: - addi a3, a3, -16 - add a3, a3, a10 - - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S16.QACC q5 - - EE.SRCMB.S16.QACC q5, a7, 0 - EE.VMULAS.S16.QACC q2, q7 - EE.SRCMB.S16.QACC q5, a9, 0 - - EE.VRELU.S16 q5, a14, a15 - dl_tie728_128b_unaligned_store0 q5, a2, a13 - j dl_tie728_s16_rescale_unaligned_add2d_shift_remainder_relu - - - -dl_tie728_s16_rescale_unaligned_add2d_shift_small_remainder_relu: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 #input1 sar - -dl_tie728_s16_rescale_unaligned_add2d_shift_remainder_relu: - beqz a10, dl_tie728_s16_unaligned_rescale_add2d_output_shift_end_relu # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q5, a7, 0 - EE.VMULAS.S16.QACC q2, q7 - EE.SRCMB.S16.QACC q5, a9, 0 - - EE.VRELU.S16 q5, a14, a15 - srli a10, a10, 1 - dl_tie728_s16_store_remainder q5, a10, a13, a2 - -dl_tie728_s16_unaligned_rescale_add2d_output_shift_end_relu: - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_add2d_11c_prelu - .type dl_tie728_s16_unaligned_add2d_11c_prelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_add2d_11c_prelu: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a10: c_remainder - # a14: activation_alpha_ptr - # a15: activation_shift - - l32i a6, a5, 64 - l32i a10, a5, 76 - l32i a7, a5, 88 - l32i a14, a5, 56 - l32i a15, a5, 60 - - bgei a7, 0, dl_tie728_s16_unaligned_rescale_add2d_11c_prelu - -# input0 exp = input1 exp = output exp - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie728_s16_unaligned_add2d_11c_small_remainder_prelu # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie728_s16_unaligned_add2d_11c_prelu_0 - beqi a13, 8, dl_tie728_s16_unaligned_add2d_11c_prelu_1 - - - loopgtz a6, 0f #dl_tie728_s16_unaligned_add2d_11c_prelu - EE.SRC.Q.QUP q2, q0, q1 - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.VADDS.S16 q2, q2, q5 - - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S16 q2, q2, q6, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 -0: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S16 q2, q2, q5 - EE.VPRELU.S16 q2, q2, q6, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie728_s16_unaligned_add2d_11c_remainder_prelu - - #output sar = 0 -dl_tie728_s16_unaligned_add2d_11c_prelu_0: - loopgtz a6, 1f #dl_tie728_s16_unaligned_add2d_11c_loop0_prelu - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S16 q2, q2, q5 - - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S16 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 -1: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S16 q2, q2, q5 - EE.VPRELU.S16 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie728_s16_unaligned_add2d_11c_remainder_prelu - - # #output sar = 8 -dl_tie728_s16_unaligned_add2d_11c_prelu_1: - loopgtz a6, 2f #dl_tie728_s16_unaligned_add2d_11c_loop1_prelu - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S16 q2, q2, q5 - - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S16 q2, q2, q6, a15 - dl_tie728_128b_unaligned_store1 q2, a2 -2: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S16 q2, q2, q5 - EE.VPRELU.S16 q2, q2, q6, a15 - dl_tie728_128b_unaligned_store1 q2, a2 - j dl_tie728_s16_unaligned_add2d_11c_remainder_prelu - -dl_tie728_s16_unaligned_add2d_11c_small_remainder_prelu: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 - -dl_tie728_s16_unaligned_add2d_11c_remainder_prelu: - - beqz a10, dl_tie728_s16_unaligned_add2d_end_prelu - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S16 q2, q2, q5 - EE.VPRELU.S16 q2, q2, q6, a15 - srli a10, a10, 1 - dl_tie728_s16_store_remainder q2, a10, a13, a2 - -dl_tie728_s16_unaligned_add2d_end_prelu: - - retw - - -## rescaled add -dl_tie728_s16_unaligned_rescale_add2d_11c_prelu: - l32i a8, a5, 96 # output_scale - l32i a9, a5, 92 # output_shift - - beqi a8, 1, dl_tie728_s16_rescale_unaligned_add2d_output_shift_prelu - - -### rescaled to output by *scale) >> shift -dl_tie728_s16_rescale_unaligned_add2d_output_scale_prelu: - - s32i a8, a1, 0 - EE.VLDBC.16 q7, a1 # all output_scale - # ssr a9 #output shift - # movi a13, 0 - - blti a6, 0, dl_tie728_s16_rescale_unaligned_add2d_scale_small_remainder_prelu # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 3f #dl_tie728_s16_rescale_unaligned_add2d_11c_scale_prelu - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_add_rescale_output q2, q1, q2, q7, a9 - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S16 q2, q2, q6, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a12 - 3: - - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - EE.VLD.128.IP q6, a14, 16 - dl_tie728_s16_rescale_add_rescale_output q2, q1, q2, q7, a9 - - EE.VPRELU.S16 q2, q2, q6, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie728_s16_rescale_unaligned_add2d_scale_remainder_prelu - - -dl_tie728_s16_rescale_unaligned_add2d_scale_small_remainder_prelu: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 #input1 sar - -dl_tie728_s16_rescale_unaligned_add2d_scale_remainder_prelu: - beqz a10, dl_tie728_s16_unaligned_rescale_add2d_output_scale_end_prelu # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - EE.VLD.128.IP q6, a14, 16 - dl_tie728_s16_rescale_add_rescale_output q2, q1, q2, q7, a9 - - EE.VPRELU.S16 q2, q2, q6, a15 - srli a10, a10, 1 - dl_tie728_s16_store_remainder q2, a10, a13, a2 - -dl_tie728_s16_unaligned_rescale_add2d_output_scale_end_prelu: - retw - - -### rescaled to output by right shift -dl_tie728_s16_rescale_unaligned_add2d_output_shift_prelu: - movi a13, 1 - s16i a13, a1, 0 - EE.VLDBC.16 q7, a1 # all 1 - - blti a6, 0, dl_tie728_s16_rescale_unaligned_add2d_shift_small_remainder_prelu # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 4f #dl_tie728_s16_rescale_unaligned_add2d_11c_shift_prelu - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S16.QACC q5 - - EE.SRCMB.S16.QACC q5, a7, 0 - EE.VMULAS.S16.QACC q2, q7 - EE.SRCMB.S16.QACC q5, a9, 0 - - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S16 q5, q5, q6, a15 - dl_tie728_128b_unaligned_store0 q5, a2, a13 - 4: - addi a3, a3, -16 - add a3, a3, a10 - - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S16.QACC q5 - - EE.SRCMB.S16.QACC q5, a7, 0 - EE.VMULAS.S16.QACC q2, q7 - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S16.QACC q5, a9, 0 - - EE.VPRELU.S16 q5, q5, q6, a15 - dl_tie728_128b_unaligned_store0 q5, a2, a13 - j dl_tie728_s16_rescale_unaligned_add2d_shift_remainder_prelu - - - -dl_tie728_s16_rescale_unaligned_add2d_shift_small_remainder_prelu: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 #input1 sar - -dl_tie728_s16_rescale_unaligned_add2d_shift_remainder_prelu: - beqz a10, dl_tie728_s16_unaligned_rescale_add2d_output_shift_end_prelu # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q5, a7, 0 - EE.VMULAS.S16.QACC q2, q7 - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S16.QACC q5, a9, 0 - - EE.VPRELU.S16 q5, q5, q6, a15 - srli a10, a10, 1 - dl_tie728_s16_store_remainder q5, a10, a13, a2 - -dl_tie728_s16_unaligned_rescale_add2d_output_shift_end_prelu: - retw \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_avg_pool2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_avg_pool2d.S deleted file mode 100644 index 33001bf8..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_avg_pool2d.S +++ /dev/null @@ -1,533 +0,0 @@ -#include "dl_tie728_s16.S" - -############################################################################################################################################################ -#### -#### dl_tie728_s16_avg_pool2d series -#### -############################################################################################################################################################ - .align 4 - .text - .global dl_tie728_s16_avg_pool2d_22c1 - .type dl_tie728_s16_avg_pool2d_22c1, @function - .section .iram1 -dl_tie728_s16_avg_pool2d_22c1: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - # a5: input_y_offset_bytes - # a6: input_x_offset_bytes - # a11: c_div_x_1 - # a14: mac_shift - - l32i a5, a4, 16 # input_y_offset_bytes - l32i a6, a4, 20 # input_x_offset_bytes - l32i a11, a4, 104 # c_div_x_1 - - add a7, a3, a6 - add a8, a3, a5 - add a9, a8, a6 - - - addi a14, a4, 64 - EE.VLDBC.16 q0, a14 # avg_pool_area_inv - l32i a14, a4, 56 # mac_shift - - EE.VLD.128.IP q1, a3, 16 - EE.VLD.128.IP q2, a7, 16 - loopgtz a11, 1f - EE.ZERO.QACC - EE.VMULAS.S16.QACC.LD.IP q3, a8, 16, q0, q1 - EE.VMULAS.S16.QACC.LD.IP q4, a9, 16, q0, q2 - EE.VMULAS.S16.QACC.LD.IP q1, a3, 16, q0, q3 - EE.VMULAS.S16.QACC.LD.IP q2, a7, 16, q0, q4 - EE.SRCMB.S16.QACC q7, a14, 0 - EE.VST.128.IP q7, a2, 16 -1: - EE.ZERO.QACC - EE.VMULAS.S16.QACC.LD.IP q3, a8, 16, q0, q1 - EE.VMULAS.S16.QACC.LD.IP q4, a9, 16, q0, q2 - EE.VMULAS.S16.QACC q0, q3 - EE.VMULAS.S16.QACC q0, q4 - EE.SRCMB.S16.QACC q7, a14, 0 - EE.VST.128.IP q7, a2, 0 - retw - - - .align 4 - .text - .global dl_tie728_s16_unaligned_avg_pool2d_22c1 - .type dl_tie728_s16_unaligned_avg_pool2d_22c1, @function - .section .iram1 -dl_tie728_s16_unaligned_avg_pool2d_22c1: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - # a5: input_y_offset_bytes - # a6: input_x_offset_bytes - # a11: c_div_x_1 - # a12: c_remainder_bytes - # a13: mac_shift - - l32i a5, a4, 16 # input_y_offset_bytes - l32i a6, a4, 20 # input_x_offset_bytes - l32i a11, a4, 104 # c_div_x_1 - l32i a12, a4, 60 # c_remainder - l32i a13, a4, 56 # mac_shift - - add a7, a3, a6 - add a8, a3, a5 - add a9, a8, a6 - - blti a11, 0, dl_tie728_s16_unaligned_avg_pool2d_22c1_remainder - EE.LD.128.USAR.IP q7, a2, 0 - RUR.SAR_BYTE a15 - - addi a11, a11, 1 - - addi a14, a4, 64 - EE.VLDBC.16 q0, a14 # avg_pool_area_inv - - EE.LD.128.USAR.IP q1, a3, 16 - EE.LD.128.USAR.IP q2, a3, 0 - beqi a15, 0, 0f - beqi a15, 8, 8f - - loopgtz a11, 3f - EE.ZERO.QACC - - EE.SRC.Q.LD.IP q3, a7, 16, q1, q2 - EE.LD.128.USAR.IP q4, a7, 0 - EE.VMULAS.S16.QACC q0, q1 - - EE.SRC.Q.LD.IP q5, a8, 16, q3, q4 - EE.LD.128.USAR.IP q6, a8, 0 - EE.VMULAS.S16.QACC q0, q3 - - EE.SRC.Q.LD.IP q3, a9, 16, q5, q6 - EE.LD.128.USAR.IP q4, a9, 0 - EE.VMULAS.S16.QACC q0, q5 - - EE.SRC.Q.LD.IP q1, a3, 16, q3, q4 - EE.LD.128.USAR.IP q2, a3, 0 - EE.VMULAS.S16.QACC q0, q3 - - EE.SRCMB.S16.QACC q7, a13, 0 - dl_tie728_128b_unaligned_store0 q7, a2, a14 - -3: - j dl_tie728_s16_unaligned_avg_pool2d_22c1_remainder - -0: - loopgtz a11, 4f - EE.ZERO.QACC - - EE.SRC.Q.LD.IP q3, a7, 16, q1, q2 - EE.LD.128.USAR.IP q4, a7, 0 - EE.VMULAS.S16.QACC q0, q1 - - EE.SRC.Q.LD.IP q5, a8, 16, q3, q4 - EE.LD.128.USAR.IP q6, a8, 0 - EE.VMULAS.S16.QACC q0, q3 - - EE.SRC.Q.LD.IP q3, a9, 16, q5, q6 - EE.LD.128.USAR.IP q4, a9, 0 - EE.VMULAS.S16.QACC q0, q5 - - EE.SRC.Q.LD.IP q1, a3, 16, q3, q4 - EE.LD.128.USAR.IP q2, a3, 0 - EE.VMULAS.S16.QACC q0, q3 - - EE.SRCMB.S16.QACC q7, a13, 0 - EE.VST.128.IP q7, a2, 16 - -4: - j dl_tie728_s16_unaligned_avg_pool2d_22c1_remainder -8: - loopgtz a11, 5f - EE.ZERO.QACC - - EE.SRC.Q.LD.IP q3, a7, 16, q1, q2 - EE.LD.128.USAR.IP q4, a7, 0 - EE.VMULAS.S16.QACC q0, q1 - - EE.SRC.Q.LD.IP q5, a8, 16, q3, q4 - EE.LD.128.USAR.IP q6, a8, 0 - EE.VMULAS.S16.QACC q0, q3 - - EE.SRC.Q.LD.IP q3, a9, 16, q5, q6 - EE.LD.128.USAR.IP q4, a9, 0 - EE.VMULAS.S16.QACC q0, q5 - - EE.SRC.Q.LD.IP q1, a3, 16, q3, q4 - EE.LD.128.USAR.IP q2, a3, 0 - EE.VMULAS.S16.QACC q0, q3 - - EE.SRCMB.S16.QACC q7, a13, 0 - dl_tie728_128b_unaligned_store1 q7, a2 - -5: - -dl_tie728_s16_unaligned_avg_pool2d_22c1_remainder: - - beqz a12, 6f - EE.ZERO.QACC - EE.SRC.Q.LD.IP q3, a7, 16, q1, q2 - EE.LD.128.USAR.IP q4, a7, 0 - EE.VMULAS.S16.QACC q0, q1 - EE.SRC.Q.LD.IP q5, a8, 16, q3, q4 - EE.LD.128.USAR.IP q6, a8, 0 - EE.VMULAS.S16.QACC q0, q3 - EE.SRC.Q q5, q5, q6 - EE.LD.128.USAR.XP q3, a9, a12 - EE.VLD.128.IP q4, a9, 0 - EE.VMULAS.S16.QACC q0, q5 - EE.SRC.Q q3, q3, q4 - EE.VMULAS.S16.QACC q0, q3 - EE.SRCMB.S16.QACC q7, a13, 0 - srli a12, a12, 1 - dl_tie728_s16_store_remainder q7, a12, a14, a2 -6: - retw - - - - .align 4 - .text - .global dl_tie728_s16_avg_pool2d_hwc1 - .type dl_tie728_s16_avg_pool2d_hwc1, @function - .section .iram1 -dl_tie728_s16_avg_pool2d_hwc1: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - # a5: input_y_offset - # a6: input_x_offset - # a11: c_div_x_1 - # a13: mac_shift - - - l32i a5, a4, 16 # input_y_offset - l32i a6, a4, 20 # input_x_offset - l32i a8, a4, 48 # filter_height - l32i a9, a4, 52 # filter_width - l32i a11, a4, 104 # c_div_x_1 - l32i a13, a4, 56 # mac_shift - - addi a14, a4, 64 - EE.VLDBC.16 q0, a14 # avg_pool_area_inv - - srli a10, a9, 1 - addi a10, a10, -1 # filter_w / 2 - 1 - - blti a9, 2, dl_tie728_s16_avg_pool2d_h1c1_loop - blti a11, 1, dl_tie728_s16_avg_pool2d_hwc1_small_channel - -1: # loop c - mov a7, a3 - mov a14, a7 # input_ptr - mov a15, a8 # height - EE.ZERO.QACC - -2: # loop h - EE.VLD.128.XP q1, a14, a6 - EE.VLD.128.XP q2, a14, a6 - loopgtz a10, 3f - EE.VMULAS.S16.QACC.LD.XP q1, a14, a6, q0, q1 - EE.VMULAS.S16.QACC.LD.XP q2, a14, a6, q0, q2 -3: - bbci a9, 0, 4f - # w left 3 - EE.VMULAS.S16.QACC.LD.XP q1, a14, a6, q0, q1 - EE.VMULAS.S16.QACC q0, q2 - EE.VMULAS.S16.QACC q0, q1 - j 5f - -4: # w left 2 - EE.VMULAS.S16.QACC q0, q1 - EE.VMULAS.S16.QACC q0, q2 -5: - addi a15, a15, -1 - add a7, a7, a5 - mov a14, a7 - bnez a15, 2b -6: - EE.SRCMB.S16.QACC q7, a13, 0 - EE.VST.128.IP q7, a2, 16 - addi a3, a3, 16 - addi a11, a11, -1 - bnez a11, 1b - -dl_tie728_s16_avg_pool2d_hwc1_small_channel: - mov a7, a3 - mov a14, a7 # input_ptr - mov a15, a8 # height - EE.ZERO.QACC - -2: # loop h - EE.VLD.128.XP q1, a14, a6 - EE.VLD.128.XP q2, a14, a6 - loopgtz a10, 3f - EE.VMULAS.S16.QACC.LD.XP q1, a14, a6, q0, q1 - EE.VMULAS.S16.QACC.LD.XP q2, a14, a6, q0, q2 -3: - bbci a9, 0, 4f - # w left 3 - EE.VMULAS.S16.QACC.LD.XP q1, a14, a6, q0, q1 - EE.VMULAS.S16.QACC q0, q2 - EE.VMULAS.S16.QACC q0, q1 - j 5f - -4: # w left 2 - EE.VMULAS.S16.QACC q0, q1 - EE.VMULAS.S16.QACC q0, q2 -5: - addi a15, a15, -1 - add a7, a7, a5 - mov a14, a7 - bnez a15, 2b - EE.SRCMB.S16.QACC q7, a13, 0 - EE.VST.128.IP q7, a2, 0 - retw - -dl_tie728_s16_avg_pool2d_h1c1_loop: - blti a11, 1, dl_tie728_s16_max_pool2d_h1c1_small_channel -1: - mov a14, a3 - EE.ZERO.QACC - EE.VLD.128.XP q1, a14, a5 - loopgtz a8, 2f - EE.VMULAS.S16.QACC.LD.XP q1, a14, a5, q0, q1 -2: - EE.SRCMB.S16.QACC q7, a13, 0 - EE.VST.128.IP q7, a2, 16 - addi a3, a3, 16 - addi a11, a11, -1 - bnez a11, 1b - -dl_tie728_s16_max_pool2d_h1c1_small_channel: - mov a14, a3 - EE.ZERO.QACC - EE.VLD.128.XP q1, a14, a5 - loopgtz a8, 2f - EE.VMULAS.S16.QACC.LD.XP q1, a14, a5, q0, q1 -2: - EE.SRCMB.S16.QACC q7, a13, 0 - EE.VST.128.IP q7, a2, 16 - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_avg_pool2d_hwc1 - .type dl_tie728_s16_unaligned_avg_pool2d_hwc1, @function - .section .iram1 -dl_tie728_s16_unaligned_avg_pool2d_hwc1: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - # a5: input_y_offset - # a6: input_x_offset - # a11: c_div_x_1 - # a12: c_remainder_bytes - # a13: mac_shift - - - l32i a5, a4, 16 # input_y_offset - l32i a6, a4, 20 # input_x_offset - l32i a8, a4, 48 # filter_height - l32i a9, a4, 52 # filter_width - l32i a11, a4, 104 # c_div_x_1 - l32i a12, a4, 60 # c_remainder - l32i a13, a4, 56 # mac_shift - - addi a14, a4, 64 - EE.VLDBC.16 q0, a14 # avg_pool_area_inv - - srli a10, a9, 1 - addi a10, a10, -1 # filter_w / 2 - 1 - - addi a6, a6, -16 - addi a11, a11, 1 - EE.LD.128.USAR.IP q7, a2, 0 - RUR.SAR_BYTE a15 - - blti a9, 2, dl_tie728_s16_unaligned_avg_pool2d_h1c1_loop - blti a11, 1, dl_tie728_s16_unaligned_avg_pool2d_hwc1_small_channel - -1: # loop c - mov a7, a3 - mov a14, a7 # input_ptr - mov a4, a8 # height - EE.ZERO.QACC -2: # loop h - EE.LD.128.USAR.IP q1, a14, 16 - EE.LD.128.USAR.XP q2, a14, a6 - loopgtz a10, 3f - EE.SRC.Q.LD.IP q3, a14, 16, q1, q2 - EE.LD.128.USAR.XP q4, a14, a6 - EE.VMULAS.S16.QACC q0, q1 - - EE.SRC.Q.LD.IP q1, a14, 16, q3, q4 - EE.LD.128.USAR.XP q2, a14, a6 - EE.VMULAS.S16.QACC q0, q3 - -3: - bbci a9, 0, 4f - # w left 3 - EE.SRC.Q.LD.IP q3, a14, 16, q1, q2 - EE.LD.128.USAR.XP q4, a14, a6 - EE.VMULAS.S16.QACC q0, q1 - - EE.SRC.Q.LD.IP q1, a14, 16, q3, q4 - EE.LD.128.USAR.XP q2, a14, a6 - EE.VMULAS.S16.QACC q0, q3 - - EE.SRC.Q q1, q1, q2 - EE.VMULAS.S16.QACC q0, q1 - j 5f - -4: # w left 2 - EE.SRC.Q.LD.IP q3, a14, 16, q1, q2 - EE.LD.128.USAR.XP q4, a14, a6 - EE.VMULAS.S16.QACC q0, q1 - - EE.SRC.Q q3, q3, q4 - EE.VMULAS.S16.QACC q0, q3 -5: - addi a4, a4, -1 - add a7, a7, a5 - mov a14, a7 - bnez a4, 2b -6: - EE.SRCMB.S16.QACC q7, a13, 0 - - beqi a15, 0, 7f - beqi a15, 8, 8f - dl_tie728_128b_unaligned_store0 q7, a2, a14 - j 9f -7: - EE.VST.128.IP q7, a2, 16 - j 9f -8: - dl_tie728_128b_unaligned_store1 q7, a2 -9: - addi a3, a3, 16 - addi a11, a11, -1 - bnez a11, 1b - -dl_tie728_s16_unaligned_avg_pool2d_hwc1_small_channel: - beqz a12, 9f - - mov a7, a3 - mov a14, a7 # input_ptr - mov a4, a8 # height - EE.ZERO.QACC -2: # loop h - EE.LD.128.USAR.IP q1, a14, 16 - EE.LD.128.USAR.XP q2, a14, a6 - loopgtz a10, 3f - EE.SRC.Q.LD.IP q3, a14, 16, q1, q2 - EE.LD.128.USAR.XP q4, a14, a6 - EE.VMULAS.S16.QACC q0, q1 - - EE.SRC.Q.LD.IP q1, a14, 16, q3, q4 - EE.LD.128.USAR.XP q2, a14, a6 - EE.VMULAS.S16.QACC q0, q3 - -3: - bbci a9, 0, 4f - # w left 3 - EE.SRC.Q.LD.IP q3, a14, 16, q1, q2 - EE.LD.128.USAR.XP q4, a14, a6 - EE.VMULAS.S16.QACC q0, q1 - - EE.SRC.Q q3, q3, q4 - EE.LD.128.USAR.XP q1, a14, a12 - EE.VLD.128.IP q2, a14, 0 - EE.VMULAS.S16.QACC q0, q3 - - EE.SRC.Q q1, q1, q2 - EE.VMULAS.S16.QACC q0, q1 - j 5f - -4: # w left 2 - EE.SRC.Q q1, q1, q2 - EE.LD.128.USAR.XP q3, a14, a12 - EE.VLD.128.IP q4, a14, 0 - EE.VMULAS.S16.QACC q0, q1 - - EE.SRC.Q q3, q3, q4 - EE.VMULAS.S16.QACC q0, q3 -5: - addi a4, a4, -1 - add a7, a7, a5 - mov a14, a7 - bnez a4, 2b -6: - EE.SRCMB.S16.QACC q7, a13, 0 - srli a12, a12, 1 - dl_tie728_s16_store_remainder q7, a12, a14, a2 -9: - - retw - -dl_tie728_s16_unaligned_avg_pool2d_h1c1_loop: - addi a5, a5, -16 - blti a11, 1, dl_tie728_s16_unaligned_avg_pool2d_h1c1_small_channel -1: - mov a14, a3 - EE.ZERO.QACC - - loopgtz a8, 2f - EE.LD.128.USAR.IP q1, a14, 16 - EE.VLD.128.XP q2, a14, a5 - EE.SRC.Q q1, q1, q2 - EE.VMULAS.S16.QACC q0, q1 -2: - EE.SRCMB.S16.QACC q7, a13, 0 - beqi a15, 0, 3f - beqi a15, 8, 4f - dl_tie728_128b_unaligned_store0 q7, a2, a14 - j 5f -3: - EE.VST.128.IP q7, a2, 16 - j 5f -4: - dl_tie728_128b_unaligned_store1 q7, a2 -5: - addi a3, a3, 16 - addi a11, a11, -1 - bnez a11, 1b - -dl_tie728_s16_unaligned_avg_pool2d_h1c1_small_channel: - beqz a12, 5f -1: - mov a14, a3 - addi a5, a5, 16 - sub a5, a5, a12 - EE.ZERO.QACC - - loopgtz a8, 2f - EE.LD.128.USAR.XP q1, a14, a12 - EE.VLD.128.XP q2, a14, a5 - EE.SRC.Q q1, q1, q2 - EE.VMULAS.S16.QACC q0, q1 -2: - EE.SRCMB.S16.QACC q7, a13, 0 - srli a12, a12, 1 - dl_tie728_s16_store_remainder q7, a12, a14, a2 -5: - retw - - - diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_conv2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_conv2d.S deleted file mode 100644 index 611fce7e..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_conv2d.S +++ /dev/null @@ -1,1128 +0,0 @@ -# TODO: put "result process" to "dl_tie728_s16.S" like "dl_tie728_s8.S" do -############################################################################################################################################################ -# result process -############################################################################################################################################################ -.macro tie728_s16_conv2d_shift_bias_relu_store output_v output_ptr mac_shift bias_v bias_ptr activation_alpha activation_shift - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - - # bias - EE.VADDS.S16 \output_v, \output_v, \bias_v - - # LeakyReLU - EE.VRELU.S16 \output_v, \activation_alpha, \activation_shift - - # store - EE.VST.128.IP \output_v, \output_ptr, 16 -.endm - - - -.macro tie728_s16_conv2d_shift_bias_prelu_store output_v output_ptr mac_shift bias_v bias_ptr activation_v activation_alpha_ptr activation_shift - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - EE.VLD.128.IP \activation_v, \activation_alpha_ptr, 16 # load PReLU alph - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - - # bias - EE.VADDS.S16 \output_v, \output_v, \bias_v - - # PReLU - EE.VPRELU.S16 \output_v, \output_v, \activation_v, \activation_shift - - # store - EE.VST.128.IP \output_v, \output_ptr, 16 -.endm - - - -.macro tie728_s16_conv2d_shift_bias_store output_v output_ptr mac_shift bias_v bias_ptr - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - - # bias - EE.VADDS.S16 \output_v, \output_v, \bias_v - - # store - EE.VST.128.IP \output_v, \output_ptr, 16 -.endm - - - -.macro tie728_s16_conv2d_shift_relu_store output_v output_ptr mac_shift activation_alpha activation_shift - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - - # LeakyReLU - EE.VRELU.S16 \output_v, \activation_alpha, \activation_shift - - # store - EE.VST.128.IP \output_v, \output_ptr, 16 -.endm - - - -.macro tie728_s16_conv2d_shift_prelu_store output_v output_ptr mac_shift activation_v activation_alpha_ptr activation_shift - EE.VLD.128.IP \activation_v, \activation_alpha_ptr, 16 # load PReLU alph - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - - # PReLU - EE.VPRELU.S16 \output_v, \output_v, \activation_v, \activation_shift - - # store - EE.VST.128.IP \output_v, \output_ptr, 16 -.endm - - - -.macro tie728_s16_conv2d_shift_store output_v output_ptr mac_shift - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - - # store - EE.VST.128.IP \output_v, \output_ptr, 16 -.endm - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -############################################################################################################################################################ -#### -#### tie728_s16_conv2d_11cn series -#### -############################################################################################################################################################ -.macro tie728_s16_conv2d_11c8 input_v0 filter_v0 filter_v1 input_ptr filter_ptr c_div_x_1 - # scalar * vecter and accumulate into QACC - # input_ptr += (c_div_x_1 + 1) * 16 in the end - # filter_ptr point to the next 16 bytes in the end - - # input_v0: 8 input elements - # filter_v0: 8 filter elements - # filter_v1: 8 filter elements - # input_ptr: input_ptr - # filter_ptr: filter_ptr - # c_div_x_1: input_channel // 8 - 1 - - EE.VLD.128.IP \input_v0, \input_ptr, 16 - EE.VLD.128.IP \filter_v0, \filter_ptr, 16 - EE.VLD.128.IP \filter_v1, \filter_ptr, 16 - - loopgtz \c_div_x_1, 0f - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - EE.VSMULAS.S16.QACC.LD.INCP \input_v0, \input_ptr, \filter_v1, \input_v0, 7 - EE.VLD.128.IP \filter_v1, \filter_ptr, 16 -0: - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - EE.VSMULAS.S16.QACC \filter_v0, \input_v0, 6 - EE.VSMULAS.S16.QACC \filter_v1, \input_v0, 7 -.endm - - - - -.macro tie728_s16_conv2d_11cn_load_args args filter_ptr c_div_x_1 n_rs3 mac_shift - l32i \n_rs3, \args, 96 // output_channel_div_8 - l32i \mac_shift, \args, 64 // mac_shift - l32i \filter_ptr, \args, 48 // filter - l32i \c_div_x_1, \args, 100 // input_channel / x - 1 -.endm - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_11cn_bias - .type dl_tie728_s16_conv2d_11cn_bias, @function - # .section .iram1 -dl_tie728_s16_conv2d_11cn_bias: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: - # a10: - # a11: bias_ptr - # a12: activation_alpha/_address - # a13: activation_shift - # a14: - # a15: moving_input_ptr - tie728_s16_conv2d_11cn_load_args a4, a5, a6, a7, a8 - - l32i a11, a4, 68 // bias - # l32i a12, a4, 76 // activation_alpha - # l32i a13, a4, 84 // activation_shift - -tie728_s16_conv2d_11cn_bias_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_11c8 q0, q1, q2, a15, a5, a6 - tie728_s16_conv2d_shift_bias_store q0, a2, a8, q1, a11 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_11cn_bias_loop - - retw - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_11cn_bias_relu - .type dl_tie728_s16_conv2d_11cn_bias_relu, @function - # .section .iram1 -dl_tie728_s16_conv2d_11cn_bias_relu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: - # a10: - # a11: bias_ptr - # a12: activation_alpha/_address - # a13: activation_shift - # a14: - # a15: moving_input_ptr - tie728_s16_conv2d_11cn_load_args a4, a5, a6, a7, a8 - - l32i a11, a4, 68 // bias - l32i a12, a4, 76 // activation_alpha - l32i a13, a4, 84 // activation_shift - -tie728_s16_conv2d_11cn_bias_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_11c8 q0, q1, q2, a15, a5, a6 - tie728_s16_conv2d_shift_bias_relu_store q0, a2, a8, q1, a11, a12, a13 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_11cn_bias_relu_loop - - retw - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_11cn_bias_prelu - .type dl_tie728_s16_conv2d_11cn_bias_prelu, @function - # .section .iram1 -dl_tie728_s16_conv2d_11cn_bias_prelu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: - # a10: - # a11: bias_ptr - # a12: activation_alpha/_address - # a13: activation_shift - # a14: - # a15: moving_input_ptr - tie728_s16_conv2d_11cn_load_args a4, a5, a6, a7, a8 - - l32i a11, a4, 68 // bias - l32i a12, a4, 80 // activation_alpha_ptr - l32i a13, a4, 84 // activation_shift - -tie728_s16_conv2d_11cn_bias_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_11c8 q0, q1, q2, a15, a5, a6 - tie728_s16_conv2d_shift_bias_prelu_store q0, a2, a8, q1, a11, q2, a12, a13 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_11cn_bias_prelu_loop - - retw - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_11cn - .type dl_tie728_s16_conv2d_11cn, @function - # .section .iram1 -dl_tie728_s16_conv2d_11cn: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: - # a10: - # a11: bias_ptr - # a12: activation_alpha/_address - # a13: activation_shift - # a14: - # a15: moving_input_ptr - tie728_s16_conv2d_11cn_load_args a4, a5, a6, a7, a8 - - # l32i a11, a4, 68 // bias - # l32i a12, a4, 76 // activation_alpha - # l32i a13, a4, 84 // activation_shift - -tie728_s16_conv2d_11cn_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_11c8 q0, q1, q2, a15, a5, a6 - tie728_s16_conv2d_shift_store q0, a2, a8 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_11cn_loop - - retw - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_11cn_relu - .type dl_tie728_s16_conv2d_11cn_relu, @function - # .section .iram1 -dl_tie728_s16_conv2d_11cn_relu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: - # a10: - # a11: bias_ptr - # a12: activation_alpha/_address - # a13: activation_shift - # a14: - # a15: moving_input_ptr - tie728_s16_conv2d_11cn_load_args a4, a5, a6, a7, a8 - - # l32i a11, a4, 68 // bias - l32i a12, a4, 76 // activation_alpha - l32i a13, a4, 84 // activation_shift - -tie728_s16_conv2d_11cn_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_11c8 q0, q1, q2, a15, a5, a6 - tie728_s16_conv2d_shift_relu_store q0, a2, a8, a12, a13 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_11cn_relu_loop - - retw - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_11cn_prelu - .type dl_tie728_s16_conv2d_11cn_prelu, @function - # .section .iram1 -dl_tie728_s16_conv2d_11cn_prelu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: - # a10: - # a11: bias_ptr - # a12: activation_alpha/_address - # a13: activation_shift - # a14: - # a15: moving_input_ptr - tie728_s16_conv2d_11cn_load_args a4, a5, a6, a7, a8 - - # l32i a11, a4, 68 // bias - l32i a12, a4, 80 // activation_alpha_ptr - l32i a13, a4, 84 // activation_shift - -tie728_s16_conv2d_11cn_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_11c8 q0, q1, q2, a15, a5, a6 - tie728_s16_conv2d_shift_prelu_store q0, a2, a8, q1, a12, a13 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_11cn_prelu_loop - - retw - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -############################################################################################################################################################ -#### -#### tie728_s16_conv2d_33cn series -#### -############################################################################################################################################################ -.macro tie728_s16_conv2d_33c8 input_v0 filter_v0 filter_v1 input_ptr filter_ptr c_div_x_1 dilation_x_offset dilation_y_offset - # dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(output_t) - # dilation_y_offset = (dilation_y * input_width_with_padding * input_channel_with_padding - input_channel - dilation_x * input_channel_with_padding * (filter_width - 1)) * sizeof(output_t) - - tie728_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_y_offset - - tie728_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_y_offset - - tie728_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - # add \input_ptr, \input_ptr, \dilation_y_offset -.endm - - - -.macro tie728_s16_conv2d_hwcn_load_args args filter_ptr c_div_x_1 n_rs3 mac_shift dilation_x_offset dilation_y_offset - tie728_s16_conv2d_11cn_load_args \args, \filter_ptr, \c_div_x_1, \n_rs3, \mac_shift - l32i \dilation_x_offset, \args, 108 // input dilation x offset - l32i \dilation_y_offset, \args, 112 // input dilation y offset -.endm - - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_33cn_bias - .type dl_tie728_s16_conv2d_33cn_bias, @function - # .section .iram1 -dl_tie728_s16_conv2d_33cn_bias: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: bias_ptr - # a12: - # a13: - # a14 - # a15: moving_input_ptr - tie728_s16_conv2d_hwcn_load_args a4, a5, a6, a7, a8, a9, a10 - - l32i a11, a4, 68 // bias - # l32i a12, a4, 76 // activation_alpha - # l32i a13, a4, 84 // activation_shift - -tie728_s16_conv2d_33cn_bias_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_33c8 q0, q1, q2, a15, a5, a6, a9, a10 - tie728_s16_conv2d_shift_bias_store q0, a2, a8, q1, a11 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_33cn_bias_loop - - retw - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_33cn_bias_relu - .type dl_tie728_s16_conv2d_33cn_bias_relu, @function - # .section .iram1 -dl_tie728_s16_conv2d_33cn_bias_relu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: bias_ptr - # a12: activation_alpha - # a13: activation_shift - # a14: - # a15: moving_input_ptr - tie728_s16_conv2d_hwcn_load_args a4, a5, a6, a7, a8, a9, a10 - - l32i a11, a4, 68 // bias - l32i a12, a4, 76 // activation_alpha - l32i a13, a4, 84 // activation_shift - -tie728_s16_conv2d_33cn_bias_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_33c8 q0, q1, q2, a15, a5, a6, a9, a10 - tie728_s16_conv2d_shift_bias_relu_store q0, a2, a8, q1, a11, a12, a13 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_33cn_bias_relu_loop - - retw - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_33cn_bias_prelu - .type dl_tie728_s16_conv2d_33cn_bias_prelu, @function - # .section .iram1 -dl_tie728_s16_conv2d_33cn_bias_prelu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: bias_ptr - # a12: activation_alpha_ptr - # a13: activation_shift - # a14: - # a15: moving_input_ptr - tie728_s16_conv2d_hwcn_load_args a4, a5, a6, a7, a8, a9, a10 - - l32i a11, a4, 68 // bias - l32i a12, a4, 80 // activation_alpha_ptr - l32i a13, a4, 84 // activation_shift - -tie728_s16_conv2d_33cn_bias_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_33c8 q0, q1, q2, a15, a5, a6, a9, a10 - tie728_s16_conv2d_shift_bias_prelu_store q0, a2, a8, q1, a11, q2, a12, a13 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_33cn_bias_prelu_loop - - retw - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_33cn - .type dl_tie728_s16_conv2d_33cn, @function - # .section .iram1 -dl_tie728_s16_conv2d_33cn: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: - # a12: - # a13: - # a14: - # a15: moving_input_ptr - tie728_s16_conv2d_hwcn_load_args a4, a5, a6, a7, a8, a9, a10 - - # l32i a11, a4, 68 // bias - # l32i a12, a4, 76 // activation_alpha - # l32i a13, a4, 84 // activation_shift - -tie728_s16_conv2d_33cn_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_33c8 q0, q1, q2, a15, a5, a6, a9, a10 - tie728_s16_conv2d_shift_store q0, a2, a8 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_33cn_loop - - retw - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_33cn_relu - .type dl_tie728_s16_conv2d_33cn_relu, @function - # .section .iram1 -dl_tie728_s16_conv2d_33cn_relu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: - # a12: activation_alpha/_address - # a13: activation_shift - # a14: - # a15: moving_input_ptr - tie728_s16_conv2d_hwcn_load_args a4, a5, a6, a7, a8, a9, a10 - - # l32i a11, a4, 68 // bias - l32i a12, a4, 76 // activation_alpha - l32i a13, a4, 84 // activation_shift - -tie728_s16_conv2d_33cn_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_33c8 q0, q1, q2, a15, a5, a6, a9, a10 - tie728_s16_conv2d_shift_relu_store q0, a2, a8, a12, a13 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_33cn_relu_loop - - retw - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_33cn_prelu - .type dl_tie728_s16_conv2d_33cn_prelu, @function - # .section .iram1 -dl_tie728_s16_conv2d_33cn_prelu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: - # a12: activation_alpha_ptr - # a13: activation_shift - # a14: - # a15: moving_input_ptr - tie728_s16_conv2d_hwcn_load_args a4, a5, a6, a7, a8, a9, a10 - - # l32i a11, a4, 68 // bias - l32i a12, a4, 80 // activation_alpha_ptr - l32i a13, a4, 84 // activation_shift - -tie728_s16_conv2d_33cn_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_33c8 q0, q1, q2, a15, a5, a6, a9, a10 - tie728_s16_conv2d_shift_prelu_store q0, a2, a8, q1, a12, a13 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_33cn_prelu_loop - - retw - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -############################################################################################################################################################ -#### -#### tie728_s16_conv2d_hwcn series -#### -############################################################################################################################################################ -.macro tie728_s16_conv2d_hwc8 input_v0 filter_v0 filter_v1 input_ptr filter_ptr c_div_x_1 dilation_x_offset dilation_y_offset filter_h filter_w args filter_offset_q - # dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(output_t) - # dilation_y_offset = (dilation_y * input_width_with_padding * input_channel_with_padding - input_channel - dilation_x * input_channel_with_padding * (filter_width - 1)) * sizeof(output_t) - # filter_h - # filter_w - - l32i \filter_h, \args, 52 # filter_height - 1: - l32i \filter_w, \args, 56 # filter_width - beqi \filter_w, 1, 3f - 2: - tie728_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - addi \filter_w, \filter_w, -1 - bgei \filter_w, 2, 2b - 3: - tie728_s16_conv2d_11c8 \input_v0, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1 - - EE.MOVI.32.A \filter_offset_q, \filter_w, 1 - add \filter_ptr, \filter_ptr, \filter_w - add \input_ptr, \input_ptr, \dilation_y_offset - - addi \filter_h, \filter_h, -1 - bnez \filter_h, 1b - - EE.MOVI.32.A \filter_offset_q, \filter_h, 2 - add \filter_ptr, \filter_ptr, \filter_h - -.endm - - - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_hwcn_bias - .type dl_tie728_s16_conv2d_hwcn_bias, @function - # .section .iram1 -dl_tie728_s16_conv2d_hwcn_bias: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: filter_height - # a12: filter_width - # a13: bias_ptr - # a14: - # a15: moving_input_ptr - tie728_s16_conv2d_hwcn_load_args a4, a5, a6, a7, a8, a9, a10 - l32i a12, a4, 60 # filter_y_offset - l32i a11, a4, 144 - EE.MOVI.32.Q q6, a12, 1 #filter_y_offset - EE.MOVI.32.Q q6, a11, 2 #filter_n_offset - - l32i a13, a4, 68 // bias - -tie728_s16_conv2d_hwcn_bias_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_hwc8 q0, q1, q2, a15, a5, a6, a9, a10, a11, a12, a4, q6 - tie728_s16_conv2d_shift_bias_store q0, a2, a8, q1, a13 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_hwcn_bias_loop - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_hwcn_bias_relu - .type dl_tie728_s16_conv2d_hwcn_bias_relu, @function - # .section .iram1 -dl_tie728_s16_conv2d_hwcn_bias_relu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: filter_height - # a12: filter_width - # a13: bias_ptr - # a14: activation_alpha - # a15: moving_input_ptr - tie728_s16_conv2d_hwcn_load_args a4, a5, a6, a7, a8, a9, a10 - - l32i a12, a4, 60 # filter_y_offset - l32i a11, a4, 144 - EE.MOVI.32.Q q6, a12, 1 #filter_y_offset - EE.MOVI.32.Q q6, a11, 2 #filter_n_offset - - l32i a13, a4, 68 // bias - l32i a14, a4, 76 // activation_alpha - -tie728_s16_conv2d_hwcn_bias_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_hwc8 q0, q1, q2, a15, a5, a6, a9, a10, a11, a12, a4, q6 - l32i a11, a4, 84 // activation_shift - tie728_s16_conv2d_shift_bias_relu_store q0, a2, a8, q1, a13, a14, a11 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_hwcn_bias_relu_loop - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_hwcn_bias_prelu - .type dl_tie728_s16_conv2d_hwcn_bias_prelu, @function - # .section .iram1 -dl_tie728_s16_conv2d_hwcn_bias_prelu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: filter_height - # a12: filter_width - # a13: bias_ptr - # a14: activation_alpha_ptr - # a15: moving_input_ptr - tie728_s16_conv2d_hwcn_load_args a4, a5, a6, a7, a8, a9, a10 - - l32i a12, a4, 60 # filter_y_offset - l32i a11, a4, 144 - EE.MOVI.32.Q q6, a12, 1 #filter_y_offset - EE.MOVI.32.Q q6, a11, 2 #filter_n_offset - - l32i a13, a4, 68 // bias_ptr - l32i a14, a4, 80 // activation_alpha_ptr - -tie728_s16_conv2d_hwcn_bias_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_hwc8 q0, q1, q2, a15, a5, a6, a9, a10, a11, a12, a4, q6 - l32i a11, a4, 84 // activation_shift - tie728_s16_conv2d_shift_bias_prelu_store q0, a2, a8, q1, a13, q2, a14, a11 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_hwcn_bias_prelu_loop - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_hwcn - .type dl_tie728_s16_conv2d_hwcn, @function - # .section .iram1 -dl_tie728_s16_conv2d_hwcn: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: filter_height - # a12: filter_width - # a13: - # a14: - # a15: moving_input_ptr - - tie728_s16_conv2d_hwcn_load_args a4, a5, a6, a7, a8, a9, a10 - l32i a12, a4, 60 # filter_y_offset - l32i a11, a4, 144 - EE.MOVI.32.Q q6, a12, 1 #filter_y_offset - EE.MOVI.32.Q q6, a11, 2 #filter_n_offset - - # l32i a13, a4, 68 // bias - -tie728_s16_conv2d_hwcn_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_hwc8 q0, q1, q2, a15, a5, a6, a9, a10, a11, a12, a4, q6 - tie728_s16_conv2d_shift_store q0, a2, a8 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_hwcn_loop - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_hwcn_relu - .type dl_tie728_s16_conv2d_hwcn_relu, @function - # .section .iram1 -dl_tie728_s16_conv2d_hwcn_relu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: filter_height - # a12: filter_width - # a13: activation_alpha - # a14: activation_shift - # a15: moving_input_ptr - - tie728_s16_conv2d_hwcn_load_args a4, a5, a6, a7, a8, a9, a10 - l32i a12, a4, 60 # filter_y_offset - l32i a11, a4, 144 - EE.MOVI.32.Q q6, a12, 1 #filter_y_offset - EE.MOVI.32.Q q6, a11, 2 #filter_n_offset - - l32i a13, a4, 76 // activation_alpha - l32i a14, a4, 84 // activation_shift - -tie728_s16_conv2d_hwcn_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_hwc8 q0, q1, q2, a15, a5, a6, a9, a10, a11, a12, a4, q6 - tie728_s16_conv2d_shift_relu_store q0, a2, a8, a13, a14 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_hwcn_relu_loop - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_conv2d_hwcn_prelu - .type dl_tie728_s16_conv2d_hwcn_prelu, @function - # .section .iram1 -dl_tie728_s16_conv2d_hwcn_prelu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: filter_height - # a12: filter_width - # a13: activation_prelu_ptr - # a14: activation_shift - # a15: moving_input_ptr - - tie728_s16_conv2d_hwcn_load_args a4, a5, a6, a7, a8, a9, a10 - l32i a12, a4, 60 # filter_y_offset - l32i a11, a4, 144 - EE.MOVI.32.Q q6, a12, 1 #filter_y_offset - EE.MOVI.32.Q q6, a11, 2 #filter_n_offset - - l32i a13, a4, 80 // activation_alpha_ptr - l32i a14, a4, 84 // activation_shift - -tie728_s16_conv2d_hwcn_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s16_conv2d_hwc8 q0, q1, q2, a15, a5, a6, a9, a10, a11, a12, a4, q6 - tie728_s16_conv2d_shift_prelu_store q0, a2, a8, q1, a13, a14 - - addi a7, a7, -1 - bnez a7, tie728_s16_conv2d_hwcn_prelu_loop - - retw \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_depthwise_conv2d_block.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_depthwise_conv2d_block.S deleted file mode 100644 index d1758949..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_depthwise_conv2d_block.S +++ /dev/null @@ -1,860 +0,0 @@ -# TODO: put "result process" to "dl_tie728_s16.S" like "dl_tie728_s8.S" do -############################################################################################################################################################ -# result process -############################################################################################################################################################ -.macro tie728_s16_depthwise_conv2d_shift_bias_relu_store output_v output_ptr mac_shift bias_v bias_ptr activation_alpha activation_shift - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - - # bias - EE.VADDS.S16 \output_v, \output_v, \bias_v - - # LeakyReLU - EE.VRELU.S16 \output_v, \activation_alpha, \activation_shift - - # store - EE.VST.128.IP \output_v, \output_ptr, 16 -.endm - - - -.macro tie728_s16_depthwise_conv2d_shift_bias_prelu_store output_v output_ptr mac_shift bias_v bias_ptr activation_v activation_alpha_ptr activation_shift - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - EE.VLD.128.IP \activation_v, \activation_alpha_ptr, 16 # load PReLU alph - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - - # bias - EE.VADDS.S16 \output_v, \output_v, \bias_v - - # PReLU - EE.VPRELU.S16 \output_v, \output_v, \activation_v, \activation_shift - - # store - EE.VST.128.IP \output_v, \output_ptr, 16 -.endm - - - -.macro tie728_s16_depthwise_conv2d_shift_bias_store output_v output_ptr mac_shift bias_v bias_ptr - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - - # bias - EE.VADDS.S16 \output_v, \output_v, \bias_v - - # store - EE.VST.128.IP \output_v, \output_ptr, 16 -.endm - - - -.macro tie728_s16_depthwise_conv2d_shift_relu_store output_v output_ptr mac_shift activation_alpha activation_shift - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - - # LeakyReLU - EE.VRELU.S16 \output_v, \activation_alpha, \activation_shift - - # store - EE.VST.128.IP \output_v, \output_ptr, 16 -.endm - - - -.macro tie728_s16_depthwise_conv2d_shift_prelu_store output_v output_ptr mac_shift activation_v activation_alpha_ptr activation_shift - EE.VLD.128.IP \activation_v, \activation_alpha_ptr, 16 # load PReLU alph - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - - # PReLU - EE.VPRELU.S16 \output_v, \output_v, \activation_v, \activation_shift - - # store - EE.VST.128.IP \output_v, \output_ptr, 16 -.endm - - - -.macro tie728_s16_depthwise_conv2d_shift_store output_v output_ptr mac_shift - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - - # store - EE.VST.128.IP \output_v, \output_ptr, 16 -.endm - - - - - -############################################################################################################################################################ -#### -#### tie728_s16_depthwise_conv2d_33c1 series -#### -############################################################################################################################################################ -.macro tie728_s16_depthwise_conv2d_3381 input_v0 filter_v0 input_v1 filter_v1 input_v2 filter_v2 input_ptr filter_ptr dilation_x_offset dilation_y_offset next_hw81 - # dilation_x_offset = input_channel_with_padding * dilation_x * sizeof(T) - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) - # next_hw81 = (-(filter_width - 1) * dilation_x * input_channel_with_padding - (filter_height - 1) * dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) + 16 - - EE.ZERO.QACC - - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v2, \input_ptr, \dilation_y_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v2, \input_ptr, \dilation_y_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v2, \input_ptr, \next_hw81 - - EE.VMULAS.S16.QACC.LD.IP \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset -.endm - - - -.macro tie728_s16_depthwise_conv2d_3381_last input_v0 filter_v0 input_v1 filter_v1 input_ptr filter_ptr dilation_x_offset dilation_y_offset - # dilation_x_offset = input_channel_with_padding * dilation_x * sizeof(T) - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) - - EE.ZERO.QACC - - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_y_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_y_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.IP \input_v0, \input_ptr, 0 - - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - # block one cycle here - - EE.VMULAS.S16.QACC \input_v0, \filter_v0 -.endm - - - -.macro tie728_s16_depthwise_conv2d_33c1_load_args args filter_ptr dilation_x_offset dilation_y_offset next_hw81 c_div_x_1 mac_shift - # dilation_x_offset = input_channel_with_padding * dilation_x * sizeof(T) - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) - # next_hw81 = (-(filter_width - 1) * dilation_x * input_channel_with_padding - (filter_height - 1) * dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) + 16 - - l32i \filter_ptr, \args, 48 - l32i \dilation_x_offset, \args, 124 - l32i \dilation_y_offset, \args, 128 - l32i \next_hw81, \args, 132 - l32i \c_div_x_1, \args, 100 - l32i \mac_shift, \args, 64 -.endm - - - - .align 4 - .text - .global dl_tie728_s16_depthwise_conv2d_33c1_bias - .type dl_tie728_s16_depthwise_conv2d_33c1_bias, @function - # .section .iram1 -dl_tie728_s16_depthwise_conv2d_33c1_bias: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_hw81 - # a9: c_div_x_1 - # a10: mac_shift - # a11: bias_ptr - # a12: - # a13: - # a14: - # a15: - tie728_s16_depthwise_conv2d_33c1_load_args a4, a5, a6, a7, a8, a9, a10 - - l32i a11, a4, 68 // bias_ptr - # l32i a12, a4, 76 // activation_alpha - # l32i a13, a4, 84 // activation_shift - - EE.VLD.128.XP q0, a3, a6 - EE.VLD.128.IP q1, a5, 16 - EE.VLD.128.XP q2, a3, a6 - - loopgtz a9, 1f - tie728_s16_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_depthwise_conv2d_shift_bias_store q3, a2, a10, q4, a11 -1: - tie728_s16_depthwise_conv2d_3381_last q0, q1, q2, q3, a3, a5, a6, a7 - tie728_s16_depthwise_conv2d_shift_bias_store q3, a2, a10, q4, a11 - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_depthwise_conv2d_33c1_bias_relu - .type dl_tie728_s16_depthwise_conv2d_33c1_bias_relu, @function - # .section .iram1 -dl_tie728_s16_depthwise_conv2d_33c1_bias_relu: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_hw81 - # a9: c_div_x_1 - # a10: mac_shift - # a11: bias_ptr - # a12: activation_alpha/_address - # a13: activation_shift - # a14: - # a15: - tie728_s16_depthwise_conv2d_33c1_load_args a4, a5, a6, a7, a8, a9, a10 - - l32i a11, a4, 68 // bias_ptr - l32i a12, a4, 76 // activation_alpha - l32i a13, a4, 84 // activation_shift - - EE.VLD.128.XP q0, a3, a6 - EE.VLD.128.IP q1, a5, 16 - EE.VLD.128.XP q2, a3, a6 - - loopgtz a9, 1f - tie728_s16_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_depthwise_conv2d_shift_bias_relu_store q3, a2, a10, q4, a11, a12, a13 -1: - tie728_s16_depthwise_conv2d_3381_last q0, q1, q2, q3, a3, a5, a6, a7 - tie728_s16_depthwise_conv2d_shift_bias_relu_store q3, a2, a10, q4, a11, a12, a13 - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_depthwise_conv2d_33c1_bias_prelu - .type dl_tie728_s16_depthwise_conv2d_33c1_bias_prelu, @function - # .section .iram1 -dl_tie728_s16_depthwise_conv2d_33c1_bias_prelu: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_hw81 - # a9: c_div_x_1 - # a10: mac_shift - # a11: bias_ptr - # a12: activation_alpha_ptr - # a13: activation_shift - # a14: - # a15: - tie728_s16_depthwise_conv2d_33c1_load_args a4, a5, a6, a7, a8, a9, a10 - - l32i a11, a4, 68 // bias_ptr - l32i a12, a4, 80 // activation_alpha_ptr - l32i a13, a4, 84 // activation_shift - - EE.VLD.128.XP q0, a3, a6 - EE.VLD.128.IP q1, a5, 16 - EE.VLD.128.XP q2, a3, a6 - - loopgtz a9, 1f - tie728_s16_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_depthwise_conv2d_shift_bias_prelu_store q3, a2, a10, q4, a11, q5, a12, a13 -1: - tie728_s16_depthwise_conv2d_3381_last q0, q1, q2, q3, a3, a5, a6, a7 - tie728_s16_depthwise_conv2d_shift_bias_prelu_store q3, a2, a10, q4, a11, q5, a12, a13 - - retw - - - - .align 4 - .text - .global dl_tie728_s16_depthwise_conv2d_33c1 - .type dl_tie728_s16_depthwise_conv2d_33c1, @function - # .section .iram1 -dl_tie728_s16_depthwise_conv2d_33c1: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_hw81 - # a9: c_div_x_1 - # a10: mac_shift - # a11: - # a12: - # a13: - # a14: - # a15: - tie728_s16_depthwise_conv2d_33c1_load_args a4, a5, a6, a7, a8, a9, a10 - - # l32i a11, a4, 68 // bias_ptr - # l32i a12, a4, 76 // activation_alpha - # l32i a13, a4, 84 // activation_shift - - EE.VLD.128.XP q0, a3, a6 - EE.VLD.128.IP q1, a5, 16 - EE.VLD.128.XP q2, a3, a6 - - loopgtz a9, 1f - tie728_s16_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_depthwise_conv2d_shift_store q3, a2, a10 -1: - tie728_s16_depthwise_conv2d_3381_last q0, q1, q2, q3, a3, a5, a6, a7 - tie728_s16_depthwise_conv2d_shift_store q3, a2, a10 - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_depthwise_conv2d_33c1_relu - .type dl_tie728_s16_depthwise_conv2d_33c1_relu, @function - # .section .iram1 -dl_tie728_s16_depthwise_conv2d_33c1_relu: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_hw81 - # a9: c_div_x_1 - # a10: mac_shift - # a11: - # a12: activation_alpha/_address - # a13: activation_shift - # a14: - # a15: - tie728_s16_depthwise_conv2d_33c1_load_args a4, a5, a6, a7, a8, a9, a10 - - # l32i a11, a4, 68 // bias_ptr - l32i a12, a4, 76 // activation_alpha - l32i a13, a4, 84 // activation_shift - - EE.VLD.128.XP q0, a3, a6 - EE.VLD.128.IP q1, a5, 16 - EE.VLD.128.XP q2, a3, a6 - - loopgtz a9, 1f - tie728_s16_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_depthwise_conv2d_shift_relu_store q3, a2, a10, a12, a13 -1: - tie728_s16_depthwise_conv2d_3381_last q0, q1, q2, q3, a3, a5, a6, a7 - tie728_s16_depthwise_conv2d_shift_relu_store q3, a2, a10, a12, a13 - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_depthwise_conv2d_33c1_prelu - .type dl_tie728_s16_depthwise_conv2d_33c1_prelu, @function - # .section .iram1 -dl_tie728_s16_depthwise_conv2d_33c1_prelu: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_hw81 - # a9: c_div_x_1 - # a10: mac_shift - # a11: - # a12: activation_alpha_ptr - # a13: activation_shift - # a14: - # a15: - tie728_s16_depthwise_conv2d_33c1_load_args a4, a5, a6, a7, a8, a9, a10 - - # l32i a11, a4, 68 // bias_ptr - l32i a12, a4, 80 // activation_alpha_ptr - l32i a13, a4, 84 // activation_shift - - EE.VLD.128.XP q0, a3, a6 - EE.VLD.128.IP q1, a5, 16 - EE.VLD.128.XP q2, a3, a6 - - loopgtz a9, 1f - tie728_s16_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_depthwise_conv2d_shift_prelu_store q3, a2, a10, q4, a12, a13 -1: - tie728_s16_depthwise_conv2d_3381_last q0, q1, q2, q3, a3, a5, a6, a7 - tie728_s16_depthwise_conv2d_shift_prelu_store q3, a2, a10, q4, a12, a13 - - retw - - - - -############################################################################################################################################################ -#### -#### tie728_s16_depthwise_conv2d_hwcn series -#### -############################################################################################################################################################ -.macro tie728_s16_depthwise_conv2d_1w81 input_v0 input_v1 input_v2 filter_v0 filter_v1 filter_v2 input_ptr filter_ptr dilation_x_offset dilation_y_offset filter_h filter_w filter_w_rs1_1 filter_y_offset - loopgtz \filter_w_rs1_1, 1f - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset -1: - bbci \filter_w, 0, 2f - # three 8-input-element left - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v2, \input_ptr, \dilation_y_offset - - EE.VMULAS.S16.QACC.LD.XP \filter_v2, \filter_ptr, \filter_y_offset, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 # block one cyle here - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset - j 3f -2: # two 8-input-element left - EE.VMULAS.S16.QACC.LD.XP \filter_v1, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - sub \input_ptr, \input_ptr, \dilation_x_offset - add \input_ptr, \input_ptr, \dilation_y_offset - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 # block one cyle here - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset -3: -.endm - - - - -.macro tie728_s16_depthwise_conv2d_1w81_last input_v0 input_v1 filter_v0 filter_v1 input_ptr filter_ptr dilation_x_offset dilation_y_offset filter_h filter_w filter_w_rs1_1 next_hw81 filter_y_offset - loopgtz \filter_w_rs1_1, 4f - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset -4: - bbci \filter_w, 0, 5f - # three 8-input-element left - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v0, \input_ptr, \next_hw81 - - EE.VMULAS.S16.QACC.LD.XP \filter_v0, \filter_ptr, \filter_y_offset, \input_v1, \filter_v1 - # block one cyle here - EE.VMULAS.S16.QACC \input_v0, \filter_v0 - j 6f -5: # two 8-input-element left - EE.VMULAS.S16.QACC.LD.XP \filter_v1, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - sub \input_ptr, \input_ptr, \dilation_x_offset - add \input_ptr, \input_ptr, \next_hw81 - EE.VMULAS.S16.QACC \input_v1, \filter_v1 -6: -.endm - - - -.macro tie728_s16_depthwise_conv2d_hw81 input_v0 input_v1 input_v2 filter_v0 filter_v1 filter_v2 input_ptr filter_ptr dilation_x_offset dilation_y_offset next_hw81 filter_h filter_w filter_w_rs1_1 args filter_offset_q filter_y_offset - # dilation_x_offset = input_channel_with_padding * dilation_x * sizeof(T) - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) - # next_hw81 = (-(filter_width - 1) * dilation_x * input_channel_with_padding - (filter_height - 1) * dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) + 16 - # filter_w_rs1_1 - - - EE.ZERO.QACC - - l32i \filter_h, \args, 52 # filter_height - - blti \filter_w, 2, 9f - EE.VLD.128.IP \filter_v0, \filter_ptr, 16 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset - blti \filter_h, 2, 8f -7: - tie728_s16_depthwise_conv2d_1w81 \input_v0, \input_v1, \input_v2, \filter_v0, \filter_v1, \filter_v2, \input_ptr, \filter_ptr, \dilation_x_offset, \dilation_y_offset, \filter_h, \filter_w, \filter_w_rs1_1, \filter_y_offset - addi \filter_h, \filter_h, -1 - bgei \filter_h, 2, 7b - -8: # last y - tie728_s16_depthwise_conv2d_1w81_last \input_v0, \input_v1, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \dilation_x_offset, \dilation_y_offset, \filter_h, \filter_w, \filter_w_rs1_1, \next_hw81, \filter_y_offset - j 12f - -# filter_w == 1 -9: - EE.VLD.128.XP \filter_v0, \filter_ptr, \filter_y_offset - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_y_offset - blti \filter_h, 2, 11f - 10: - EE.VMULAS.S16.QACC.LD.XP \filter_v0, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_y_offset - addi \filter_h, \filter_h, -1 - bgei \filter_h, 2, 10b - 11: # last y - EE.VMULAS.S16.QACC \input_v0, \filter_v0 - sub \input_ptr, \input_ptr, \dilation_y_offset - add \input_ptr, \input_ptr, \next_hw81 - -12: - EE.MOVI.32.A \filter_offset_q, \filter_h, 2 - add \filter_ptr, \filter_ptr, \filter_h -.endm - - - - -.macro tie728_s16_depthwise_conv2d_hwc1_load_args args filter_ptr dilation_x_offset dilation_y_offset next_hw81 c_div_x_1 mac_shift filter_w filter_w_rs1_1 - tie728_s16_depthwise_conv2d_33c1_load_args \args, \filter_ptr, \dilation_x_offset, \dilation_y_offset, \next_hw81, \c_div_x_1, \mac_shift - l32i \filter_w, \args, 56 - l32i \filter_w_rs1_1, \args, 148 -.endm - - - - - .align 4 - .text - .global dl_tie728_s16_depthwise_conv2d_hwc1_bias - .type dl_tie728_s16_depthwise_conv2d_hwc1_bias, @function - # .section .iram1 -dl_tie728_s16_depthwise_conv2d_hwc1_bias: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_hw81 - # a9: c_div_x_1 - # a10: mac_shift - # a11: filter_h, bias_ptr - # a12: filter_w - # a13: filter_w_rs1_1 - # a14: bias_ptr - # a15: - - l32i a11, a4, 144 - l32i a15, a4, 60 - EE.MOVI.32.Q q7, a11, 2 - - tie728_s16_depthwise_conv2d_hwc1_load_args a4, a5, a6, a7, a8, a9, a10, a12, a13 - - l32i a14, a4, 68 // bias_ptr - # l32i a14, a4, 80 // activation_alpha_ptr - # l32i a15, a4, 84 // activation_shift - -tie728_s16_depthwise_conv2d_hwc1_bias_loop: - tie728_s16_depthwise_conv2d_hw81 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a15 - tie728_s16_depthwise_conv2d_shift_bias_store q0, a2, a10, q1, a14 - - addi a9, a9, -1 - bgez a9, tie728_s16_depthwise_conv2d_hwc1_bias_loop - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_depthwise_conv2d_hwc1_bias_relu - .type dl_tie728_s16_depthwise_conv2d_hwc1_bias_relu, @function - # .section .iram1 -dl_tie728_s16_depthwise_conv2d_hwc1_bias_relu: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_hw81 - # a9: c_div_x_1 - # a10: mac_shift - # a11: filter_h, activation_shift - # a12: filter_w - # a13: filter_w_rs1_1 - # a14: bias_ptr - # a15: activation_alpha - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - tie728_s16_depthwise_conv2d_hwc1_load_args a4, a5, a6, a7, a8, a9, a10, a12, a13 - - l32i a15, a4, 76 // activation_alpha - l32i a14, a4, 68 // bias_ptr - - EE.MOVI.32.Q q7, a10, 3 - -tie728_s16_depthwise_conv2d_hwc1_bias_relu_loop: - EE.MOVI.32.A q7, a10, 1 - tie728_s16_depthwise_conv2d_hw81 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a10 - l32i a11, a4, 84 // activation_shift - EE.MOVI.32.A q7, a10, 3 - tie728_s16_depthwise_conv2d_shift_bias_relu_store q0, a2, a10, q1, a14, a15, a11 - - addi a9, a9, -1 - bgez a9, tie728_s16_depthwise_conv2d_hwc1_bias_relu_loop - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_depthwise_conv2d_hwc1_bias_prelu - .type dl_tie728_s16_depthwise_conv2d_hwc1_bias_prelu, @function - # .section .iram1 -dl_tie728_s16_depthwise_conv2d_hwc1_bias_prelu: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_hw81 - # a9: c_div_x_1 - # a10: mac_shift - # a11: filter_h, activation_shift - # a12: filter_w - # a13: filter_w_rs1_1 - # a14: bias_ptr - # a15: activation_alpha_ptr - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - tie728_s16_depthwise_conv2d_hwc1_load_args a4, a5, a6, a7, a8, a9, a10, a12, a13 - - l32i a15, a4, 80 // activation_alpha_ptr - l32i a14, a4, 68 // bias_ptr - EE.MOVI.32.Q q7, a10, 3 - -tie728_s16_depthwise_conv2d_hwc1_bias_prelu_loop: - EE.MOVI.32.A q7, a10, 1 - tie728_s16_depthwise_conv2d_hw81 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a10 - l32i a11, a4, 84 // activation_shift - EE.MOVI.32.A q7, a10, 3 - tie728_s16_depthwise_conv2d_shift_bias_prelu_store q0, a2, a10, q1, a14, q2, a15, a11 - - addi a9, a9, -1 - bgez a9, tie728_s16_depthwise_conv2d_hwc1_bias_prelu_loop - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_depthwise_conv2d_hwc1 - .type dl_tie728_s16_depthwise_conv2d_hwc1, @function - # .section .iram1 -dl_tie728_s16_depthwise_conv2d_hwc1: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_hw81 - # a9: c_div_x_1 - # a10: mac_shift - # a11: filter_h, bias_ptr - # a12: filter_w - # a13: filter_w_rs1_1 - # a14: - # a15: - - l32i a11, a4, 144 - l32i a15, a4, 60 - EE.MOVI.32.Q q7, a11, 2 - - tie728_s16_depthwise_conv2d_hwc1_load_args a4, a5, a6, a7, a8, a9, a10, a12, a13 - - # l32i a14, a4, 80 // activation_alpha_ptr - # l32i a15, a4, 84 // activation_shift - -tie728_s16_depthwise_conv2d_hwc1_loop: - tie728_s16_depthwise_conv2d_hw81 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a15 - # l32i a11, a4, 68 // bias_ptr - tie728_s16_depthwise_conv2d_shift_store q0, a2, a10 - - addi a9, a9, -1 - bgez a9, tie728_s16_depthwise_conv2d_hwc1_loop - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_depthwise_conv2d_hwc1_relu - .type dl_tie728_s16_depthwise_conv2d_hwc1_relu, @function - # .section .iram1 -dl_tie728_s16_depthwise_conv2d_hwc1_relu: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_hw81 - # a9: c_div_x_1 - # a10: mac_shift - # a11: filter_h, bias_ptr - # a12: filter_w - # a13: filter_w_rs1_1 - # a14: activation_alpha - # a15: activation_shift - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - tie728_s16_depthwise_conv2d_hwc1_load_args a4, a5, a6, a7, a8, a9, a10, a12, a13 - - l32i a14, a4, 76 // activation_alpha - l32i a15, a4, 84 // activation_shift - - EE.MOVI.32.Q q7, a10, 3 - -tie728_s16_depthwise_conv2d_hwc1_relu_loop: - EE.MOVI.32.A q7, a10, 1 - tie728_s16_depthwise_conv2d_hw81 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a10 - # l32i a11, a4, 68 // bias_ptr - EE.MOVI.32.A q7, a10, 3 - tie728_s16_depthwise_conv2d_shift_relu_store q0, a2, a10, a14, a15 - - addi a9, a9, -1 - bgez a9, tie728_s16_depthwise_conv2d_hwc1_relu_loop - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_depthwise_conv2d_hwc1_prelu - .type dl_tie728_s16_depthwise_conv2d_hwc1_prelu, @function - # .section .iram1 -dl_tie728_s16_depthwise_conv2d_hwc1_prelu: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - # a5: int16_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_hw81 - # a9: c_div_x_1 - # a10: mac_shift - # a11: filter_h, bias_ptr - # a12: filter_w - # a13: filter_w_rs1_1 - # a14: activation_alpha_ptr - # a15: activation_shift - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - tie728_s16_depthwise_conv2d_hwc1_load_args a4, a5, a6, a7, a8, a9, a10, a12, a13 - - l32i a14, a4, 80 // activation_alpha_ptr - l32i a15, a4, 84 // activation_shift - - EE.MOVI.32.Q q7, a10, 3 - -tie728_s16_depthwise_conv2d_hwc1_prelu_loop: - EE.MOVI.32.A q7, a10, 1 - tie728_s16_depthwise_conv2d_hw81 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a10 - # l32i a11, a4, 68 // bias_ptr - EE.MOVI.32.A q7, a10, 3 - tie728_s16_depthwise_conv2d_shift_prelu_store q0, a2, a10, q1, a14, a15 - - addi a9, a9, -1 - bgez a9, tie728_s16_depthwise_conv2d_hwc1_prelu_loop - - retw diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_max2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_max2d.S deleted file mode 100644 index e10ac85f..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_max2d.S +++ /dev/null @@ -1,181 +0,0 @@ -#include "dl_tie728_s16.S" - - -############################################################################################################################################################ -#### -#### tie728_s16_max2d_11c series -#### -############################################################################################################################################################ - - - .align 4 - .text - .global dl_tie728_s16_max2d_11c - .type dl_tie728_s16_max2d_11c, @function - .section .iram1 -dl_tie728_s16_max2d_11c: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - - - l32i a6, a5, 64 - blti a6, 0, 5f - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.VMAX.S16.LD.INCP q0, a3, q2, q0, q1 - EE.VLD.128.IP q1, a4, 16 - EE.VST.128.IP q2, a2, 16 -0: - - EE.VMAX.S16 q2, q0, q1 - EE.VST.128.IP q2, a2, 16 -5: - retw - - - -############################################################################################################################################################ -#### -#### tie728_s16_unaligned_max2d_11c series -#### -############################################################################################################################################################ - .align 4 - .text - .global dl_tie728_s16_unaligned_max2d_11c - .type dl_tie728_s16_unaligned_max2d_11c, @function - .section .iram1 -dl_tie728_s16_unaligned_max2d_11c: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: c_remainder - - l32i a6, a5, 64 - l32i a7, a5, 76 - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie718_s16_unaligned_max2d_11c_small_remainder # channel < 8 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie718_s16_unaligned_max2d_11c_0 - beqi a13, 8, dl_tie718_s16_unaligned_max2d_11c_1 - - loopgtz a6, 0f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S16 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - - 0: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S16 q2, q2, q5 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie718_s16_unaligned_max2d_11c_remainder - -dl_tie718_s16_unaligned_max2d_11c_0: - - loopgtz a6, 1f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S16 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VST.128.IP q2, a2, 16 - 1: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S16 q2, q2, q5 - EE.VST.128.IP q2, a2, 16 - j dl_tie718_s16_unaligned_max2d_11c_remainder - -dl_tie718_s16_unaligned_max2d_11c_1: - - loopgtz a6, 2f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S16 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_128b_unaligned_store1 q2, a2 - 2: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S16 q2, q2, q5 - dl_tie728_128b_unaligned_store1 q2, a2 - - j dl_tie718_s16_unaligned_max2d_11c_remainder - -dl_tie718_s16_unaligned_max2d_11c_small_remainder: - EE.LD.128.USAR.XP q0, a3, a7 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a7 - rur.sar_byte a12 - -dl_tie718_s16_unaligned_max2d_11c_remainder: - - - beqz a7, dl_tie728_s16_unaligned_max2d_11c_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VMAX.S16 q2, q2, q5 - srli a7, a7, 1 - dl_tie728_s16_store_remainder q2, a7, a12, a2 - -dl_tie728_s16_unaligned_max2d_11c_end: - retw diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_max_pool2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_max_pool2d.S deleted file mode 100644 index 31dec1cb..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_max_pool2d.S +++ /dev/null @@ -1,538 +0,0 @@ -#include "dl_tie728_s16.S" - -############################################################################################################################################################ -#### -#### dl_tie728_s16_max_pool2d series -#### -############################################################################################################################################################ - - .align 4 - .text - .global dl_tie728_s16_max_pool2d_22c1 - .type dl_tie728_s16_max_pool2d_22c1, @function - .section .iram1 -dl_tie728_s16_max_pool2d_22c1: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - # a5: input_y_offset_bytes - # a6: input_x_offset_bytes - # a10: c_div_x_1 - - l32i a5, a4, 16 # input_y_offset_bytes - l32i a6, a4, 20 # input_x_offset_bytes - l32i a10, a4, 104 # c_div_x_1 - - add a7, a3, a6 - add a8, a3, a5 - add a9, a8, a6 - -0: - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a7, 16 - loopgtz a10, 1f - EE.VMAX.S16.LD.INCP q2, a8, q7, q0, q1 - EE.VMAX.S16.LD.INCP q3, a9, q7, q7, q2 - EE.VMAX.S16.LD.INCP q0, a3, q7, q7, q3 - EE.VST.128.IP q7, a2, 16 - EE.VLD.128.IP q1, a7, 16 -1: - EE.VMAX.S16.LD.INCP q2, a8, q7, q0, q1 - EE.VMAX.S16.LD.INCP q3, a9, q7, q7, q2 - EE.VMAX.S16 q7, q7, q3 - EE.VST.128.IP q7, a2, 16 - retw - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_max_pool2d_22c1 - .type dl_tie728_s16_unaligned_max_pool2d_22c1, @function - .section .iram1 -dl_tie728_s16_unaligned_max_pool2d_22c1: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - # a5: input_y_offset_bytes - # a6: input_x_offset_bytes - # a10: c_div_x_1 - # a12: c_remainder_bytes - - l32i a5, a4, 16 # input_y_offset - l32i a6, a4, 20 # input_x_offset - l32i a10, a4, 104 - l32i a12, a4, 60 # c_remainder - - add a7, a3, a6 - add a8, a3, a5 - add a9, a8, a6 - - blti a10, 0, dl_tie728_s16_unaligned_max_pool2d_22c1_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - RUR.SAR_BYTE a13 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q1, a3, 0 - - beqi a13, 0, 0f - beqi a13, 8, 8f - - loopgtz a10, 1f - EE.SRC.Q.LD.IP q2, a7, 16, q0, q1 - EE.LD.128.USAR.IP q3, a7, 0 - EE.SRC.Q.LD.IP q4, a8, 16, q2, q3 - - EE.LD.128.USAR.IP q5, a8, 0 - EE.VMAX.S16 q7, q0, q2 - EE.SRC.Q.LD.IP q2, a9, 16, q4, q5 - - EE.LD.128.USAR.IP q3, a9, 0 - EE.VMAX.S16 q7, q7, q4 - EE.SRC.Q.LD.IP q0, a3, 16, q2, q3 - - EE.LD.128.USAR.IP q1, a3, 0 - EE.VMAX.S16 q7, q7, q2 - - dl_tie728_128b_unaligned_store0 q7, a2, a13 -1: - j dl_tie728_s16_unaligned_max_pool2d_22c1_end - -0: - loopgtz a10, 2f - EE.SRC.Q.LD.IP q2, a7, 16, q0, q1 - EE.LD.128.USAR.IP q3, a7, 0 - EE.SRC.Q.LD.IP q4, a8, 16, q2, q3 - - EE.LD.128.USAR.IP q5, a8, 0 - EE.VMAX.S16 q7, q0, q2 - EE.SRC.Q.LD.IP q2, a9, 16, q4, q5 - - EE.LD.128.USAR.IP q3, a9, 0 - EE.VMAX.S16 q7, q7, q4 - EE.SRC.Q.LD.IP q0, a3, 16, q2, q3 - - EE.LD.128.USAR.IP q1, a3, 0 - EE.VMAX.S16 q7, q7, q2 - - EE.VST.128.IP q7, a2, 16 -2: - j dl_tie728_s16_unaligned_max_pool2d_22c1_end -8: - loopgtz a10, 3f - EE.SRC.Q.LD.IP q2, a7, 16, q0, q1 - EE.LD.128.USAR.IP q3, a7, 0 - EE.SRC.Q.LD.IP q4, a8, 16, q2, q3 - - EE.LD.128.USAR.IP q5, a8, 0 - EE.VMAX.S16 q7, q0, q2 - EE.SRC.Q.LD.IP q2, a9, 16, q4, q5 - - EE.LD.128.USAR.IP q3, a9, 0 - EE.VMAX.S16 q7, q7, q4 - EE.SRC.Q.LD.IP q0, a3, 16, q2, q3 - - EE.LD.128.USAR.IP q1, a3, 0 - EE.VMAX.S16 q7, q7, q2 - - dl_tie728_128b_unaligned_store1 q7, a2 -3: - j dl_tie728_s16_unaligned_max_pool2d_22c1_end - -dl_tie728_s16_unaligned_max_pool2d_22c1_end: - EE.SRC.Q.LD.IP q2, a7, 16, q0, q1 - EE.LD.128.USAR.IP q3, a7, 0 - EE.SRC.Q.LD.IP q4, a8, 16, q2, q3 - - EE.LD.128.USAR.IP q5, a8, 0 - EE.VMAX.S16 q7, q0, q2 - EE.SRC.Q.LD.IP q2, a9, 16, q4, q5 - - EE.LD.128.USAR.IP q3, a9, 0 - EE.VMAX.S16 q7, q7, q4 - EE.SRC.Q q2, q2, q3 - EE.VMAX.S16 q7, q7, q2 - - dl_tie728_128b_unaligned_store0 q7, a2, a13 - - beqz a12, 4f - -dl_tie728_s16_unaligned_max_pool2d_22c1_remainder: - EE.LD.128.USAR.XP q0, a3, a12 - EE.VLD.128.IP q1, a3, 0 - EE.SRC.Q q0, q0, q1 - - EE.LD.128.USAR.XP q2, a7, a12 - EE.VLD.128.IP q3, a7, 0 - EE.SRC.Q q2, q2, q3 - - EE.LD.128.USAR.XP q4, a8, a12 - EE.VLD.128.IP q5, a8, 0 - EE.VMAX.S16 q7, q0, q2 - EE.SRC.Q q4, q4, q5 - - EE.LD.128.USAR.XP q0, a9, a12 - EE.VLD.128.IP q1, a9, 0 - EE.VMAX.S16 q7, q7, q4 - EE.SRC.Q q0, q0, q1 - - EE.VMAX.S16 q7, q7, q0 - srli a12, a12, 1 - dl_tie728_s16_store_remainder q7, a12, a14, a2 -4: - retw - - - - - .align 4 - .text - .global dl_tie728_s16_max_pool2d_hwc1 - .type dl_tie728_s16_max_pool2d_hwc1, @function - .section .iram1 -dl_tie728_s16_max_pool2d_hwc1: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - # a5: input_y_offset - # a6: input_x_offset - # a7: c_div_x_1 - # a8: filter_height - # a9: filter_width - # a10: filter_width/2 - 1 - - l32i a5, a4, 16 # input_y_offset - l32i a6, a4, 20 # input_x_offset - l32i a7, a4, 104 # c_div_x_1 - l32i a8, a4, 48 # filter_height - l32i a9, a4, 52 # filter_width - - srli a10, a9, 1 - addi a10, a10, -1 # w / 2 - 1 - - blti a9, 2, dl_tie728_s16_max_pool2d_h1c1_loop - blti a7, 1, dl_tie728_s16_max_pool2d_hwc1_small_channel - -1: # loop c - mov a11, a3 - mov a13, a11 - EE.VLD.128.IP q7, a13, 0 - mov a14, a8 - -2: # loop h - EE.VLD.128.XP q0, a13, a6 - loopgtz a10, 3f # loop w - EE.VLD.128.XP q1, a13, a6 - EE.VMAX.S16 q7, q7, q0 - EE.VLD.128.XP q0, a13, a6 - EE.VMAX.S16 q7, q7, q1 -3: - bbci a9, 0, 4f - # w left 3 - EE.VLD.128.XP q1, a13, a6 - EE.VMAX.S16 q7, q7, q0 - EE.VLD.128.XP q0, a13, a6 - EE.VMAX.S16 q7, q7, q1 - EE.VMAX.S16 q7, q7, q0 - j 5f - -4: # w left 2 - EE.VLD.128.XP q1, a13, a6 - EE.VMAX.S16 q7, q7, q0 - EE.VMAX.S16 q7, q7, q1 - -5: - addi a14, a14, -1 - add a11, a11, a5 - mov a13, a11 - bnez a14, 2b -6: - EE.VST.128.IP q7, a2, 16 - addi a3, a3, 16 - addi a7, a7, -1 - bnez a7, 1b - - -dl_tie728_s16_max_pool2d_hwc1_small_channel: - - mov a11, a3 - mov a13, a11 - EE.VLD.128.IP q7, a13, 0 - mov a14, a8 - -2: # loop h - EE.VLD.128.XP q0, a13, a6 - loopgtz a10, 3f # loop w - EE.VLD.128.XP q1, a13, a6 - EE.VMAX.S16 q7, q7, q0 - EE.VLD.128.XP q0, a13, a6 - EE.VMAX.S16 q7, q7, q1 -3: - bbci a9, 0, 4f - # w left 3 - EE.VLD.128.XP q1, a13, a6 - EE.VMAX.S16 q7, q7, q0 - EE.VLD.128.XP q0, a13, a6 - EE.VMAX.S16 q7, q7, q1 - EE.VMAX.S16 q7, q7, q0 - j 5f - -4: # w left 2 - EE.VLD.128.XP q1, a13, a6 - EE.VMAX.S16 q7, q7, q0 - EE.VMAX.S16 q7, q7, q1 - -5: - addi a14, a14, -1 - add a11, a11, a5 - mov a13, a11 - bnez a14, 2b -6: - EE.VST.128.IP q7, a2, 16 - retw - -dl_tie728_s16_max_pool2d_h1c1_loop: - blti a7, 1, dl_tie728_s16_max_pool2d_h1c1_small_channel -1: - mov a13, a3 - EE.VLD.128.IP q7, a13, 0 - loopgtz a8, 2f - EE.VLD.128.XP q0, a13, a5 - EE.VMAX.S16 q7, q7, q0 -2: - EE.VST.128.IP q7, a2, 16 - addi a3, a3, 16 - addi a7, a7, -1 - bnez a7, 1b - -dl_tie728_s16_max_pool2d_h1c1_small_channel: - mov a13, a3 - EE.VLD.128.IP q7, a13, 0 - loopgtz a8, 1f - EE.VLD.128.XP q0, a13, a5 - EE.VMAX.S16 q7, q7, q0 -1: - EE.VST.128.IP q7, a2, 16 - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_max_pool2d_hwc1 - .type dl_tie728_s16_unaligned_max_pool2d_hwc1, @function - .section .iram1 -dl_tie728_s16_unaligned_max_pool2d_hwc1: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - # a5: input_y_offset - # a6: input_x_offset - # a7: c_div_x_1 - # a8: filter_height - # a9: filter_width - # a10: filter_width/2 - 1 - # a12: c_remainder_bytes - - l32i a5, a4, 16 # input_y_offset - l32i a6, a4, 20 # input_x_offset - l32i a7, a4, 104 # c_div_x_1 - l32i a8, a4, 48 # filter_height - l32i a9, a4, 52 # filter_width - l32i a12, a4, 60 # c_remainder_bytes - - srli a10, a9, 1 - addi a10, a10, -1 # w/2-1 - - addi a6, a6, -16 - - EE.LD.128.USAR.IP q0, a2, 0 - RUR.SAR_BYTE a15 - - addi a7, a7, 1 - - blti a9, 2, dl_tie728_s16_unaligned_max_pool2d_h1c1_loop - blti a7, 1, dl_tie728_s16_unaligned_max_pool2d_hwc1_small_channel - - 1: # loop c - mov a11, a3 - mov a13, a11 - EE.LD.128.USAR.IP q0, a13, 16 - EE.LD.128.USAR.IP q1, a13, -16 - EE.SRC.Q q7, q0, q1 - mov a14, a8 - -2: # loop h - EE.LD.128.USAR.IP q0, a13, 16 - EE.LD.128.USAR.XP q1, a13, a6 - loopgtz a10, 3f # loop w - EE.SRC.Q.LD.IP q2, a13, 16, q0, q1 - EE.LD.128.USAR.XP q3, a13, a6 - EE.VMAX.S16 q7, q7, q0 - - EE.SRC.Q.LD.IP q0, a13, 16, q2, q3 - EE.LD.128.USAR.XP q1, a13, a6 - EE.VMAX.S16 q7, q7, q2 -3: - bbci a9, 0, 4f - # w left 3 - EE.SRC.Q.LD.IP q2, a13, 16, q0, q1 - EE.LD.128.USAR.XP q3, a13, a6 - EE.VMAX.S16 q7, q7, q0 - - EE.SRC.Q.LD.IP q0, a13, 16, q2, q3 - EE.LD.128.USAR.XP q1, a13, a6 - EE.VMAX.S16 q7, q7, q2 - - EE.SRC.Q q0, q0, q1 - EE.VMAX.S16 q7, q7, q0 - j 5f - -4: # w left 2 - EE.SRC.Q.LD.IP q2, a13, 16, q0, q1 - EE.LD.128.USAR.XP q3, a13, a6 - EE.VMAX.S16 q7, q7, q0 - - EE.SRC.Q q2, q2, q3 - EE.VMAX.S16 q7, q7, q2 - -5: - addi a14, a14, -1 - add a11, a11, a5 - mov a13, a11 - bnez a14, 2b -6: - beqi a15, 0, 7f - beqi a15, 8, 8f - dl_tie728_128b_unaligned_store0 q7, a2, a14 - j 9f -7: - EE.VST.128.IP q7, a2, 16 - j 9f -8: - dl_tie728_128b_unaligned_store1 q7, a2 -9: - addi a3, a3, 16 - addi a7, a7, -1 - bnez a7, 1b - - -dl_tie728_s16_unaligned_max_pool2d_hwc1_small_channel: - beqz a12, 9f - - mov a11, a3 - mov a13, a11 - EE.LD.128.USAR.IP q0, a13, 16 - EE.LD.128.USAR.IP q1, a13, -16 - EE.SRC.Q q7, q0, q1 - mov a14, a8 - -2: # loop h - EE.LD.128.USAR.IP q0, a13, 16 - EE.LD.128.USAR.XP q1, a13, a6 - loopgtz a10, 3f # loop w - EE.SRC.Q.LD.IP q2, a13, 16, q0, q1 - EE.LD.128.USAR.XP q3, a13, a6 - EE.VMAX.S16 q7, q7, q0 - - EE.SRC.Q.LD.IP q0, a13, 16, q2, q3 - EE.LD.128.USAR.XP q1, a13, a6 - EE.VMAX.S16 q7, q7, q2 -3: - bbci a9, 0, 4f - # w left 3 - EE.SRC.Q.LD.IP q2, a13, 16, q0, q1 - EE.LD.128.USAR.XP q3, a13, a6 - EE.VMAX.S16 q7, q7, q0 - - EE.SRC.Q q2, q2, q3 - EE.LD.128.USAR.XP q0, a13, a12 - EE.VLD.128.IP q1, a13, 0 - EE.VMAX.S16 q7, q7, q2 - - EE.SRC.Q q0, q0, q1 - EE.VMAX.S16 q7, q7, q0 - j 5f - -4: # w left 2 - EE.SRC.Q q0, q0, q1 - EE.LD.128.USAR.XP q2, a13, a12 - EE.VLD.128.IP q3, a13, 0 - EE.VMAX.S16 q7, q7, q0 - - EE.SRC.Q q2, q2, q3 - EE.VMAX.S16 q7, q7, q2 - -5: - addi a14, a14, -1 - add a11, a11, a5 - mov a13, a11 - bnez a14, 2b -6: - srli a12, a12, 1 - dl_tie728_s16_store_remainder q7, a12, a14, a2 -9: - retw - - -dl_tie728_s16_unaligned_max_pool2d_h1c1_loop: - addi a5, a5, -16 - blti a7, 1, dl_tie728_s16_unaligned_max_pool2d_h1c1_small_channel -1: - mov a13, a3 - EE.LD.128.USAR.IP q0, a13, 16 - EE.LD.128.USAR.IP q1, a13, -16 - EE.SRC.Q q7, q0, q1 - - loopgtz a8, 2f - EE.LD.128.USAR.IP q0, a13, 16 - EE.LD.128.USAR.XP q1, a13, a5 - EE.SRC.Q q0, q0, q1 - EE.VMAX.S16 q7, q7, q0 -2: - beqi a15, 0, 3f - beqi a15, 8, 4f - dl_tie728_128b_unaligned_store0 q7, a2, a14 - j 5f -3: - EE.VST.128.IP q7, a2, 16 - j 5f -4: - dl_tie728_128b_unaligned_store1 q7, a2 -5: - addi a3, a3, 16 - addi a7, a7, -1 - bnez a7, 1b - -dl_tie728_s16_unaligned_max_pool2d_h1c1_small_channel: - beqz a12, 5f - - mov a13, a3 - EE.LD.128.USAR.XP q0, a13, a12 - EE.VLD.128.IP q1, a13, 0 - EE.SRC.Q q7, q0, q1 - sub a13, a13, a12 - addi a5, a5, 16 - sub a5, a5, a12 - - loopgtz a8, 1f - EE.LD.128.USAR.XP q0, a13, a12 - EE.VLD.128.XP q1, a13, a5 - EE.SRC.Q q0, q0, q1 - EE.VMAX.S16 q7, q7, q0 -1: - srli a12, a12, 1 - dl_tie728_s16_store_remainder q7, a12, a14, a2 -5: - retw \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_min2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_min2d.S deleted file mode 100644 index c426f9e0..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_min2d.S +++ /dev/null @@ -1,186 +0,0 @@ -#include "dl_tie728_s16.S" - - -############################################################################################################################################################ -#### -#### tie728_s16_min2d_11c series -#### -############################################################################################################################################################ - - - .align 4 - .text - .global dl_tie728_s16_min2d_11c - .type dl_tie728_s16_min2d_11c, @function - .section .iram1 -dl_tie728_s16_min2d_11c: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - - - l32i a6, a5, 64 - blti a6, 0, 5f - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.VMIN.s16.LD.INCP q0, a3, q2, q0, q1 - EE.VLD.128.IP q1, a4, 16 - EE.VST.128.IP q2, a2, 16 - 0: - - EE.VMIN.S16 q2, q0, q1 - EE.VST.128.IP q2, a2, 16 -5: - retw - - - -############################################################################################################################################################ -#### -#### tie728_s16_unaligned_min2d_11c series -#### -############################################################################################################################################################ - .align 4 - .text - .global dl_tie728_s16_unaligned_min2d_11c - .type dl_tie728_s16_unaligned_min2d_11c, @function - .section .iram1 -dl_tie728_s16_unaligned_min2d_11c: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: c_remainder - - - l32i a6, a5, 64 - l32i a7, a5, 76 - - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie718_s16_unaligned_min2d_11c_small_remainder # channel < 16 - - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie718_s16_unaligned_min2d_11c_0 - beqi a13, 8, dl_tie718_s16_unaligned_min2d_11c_1 - - - loopgtz a6, 0f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMIN.S16 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMIN.S16 q2, q2, q5 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie718_s16_unaligned_min2d_11c_remainder - -dl_tie718_s16_unaligned_min2d_11c_0: - - loopgtz a6, 1f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMIN.S16 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VST.128.IP q2, a2, 16 - 1: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMIN.S16 q2, q2, q5 - EE.VST.128.IP q2, a2, 16 - j dl_tie718_s16_unaligned_min2d_11c_remainder - -dl_tie718_s16_unaligned_min2d_11c_1: - - loopgtz a6, 2f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMIN.S16 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_128b_unaligned_store1 q2, a2 - 2: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMIN.S16 q2, q2, q5 - dl_tie728_128b_unaligned_store1 q2, a2 - - j dl_tie718_s16_unaligned_min2d_11c_remainder - -dl_tie718_s16_unaligned_min2d_11c_small_remainder: - EE.LD.128.USAR.XP q0, a3, a7 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a7 - rur.sar_byte a12 - -dl_tie718_s16_unaligned_min2d_11c_remainder: - - - beqz a7, dl_tie728_s16_unaligned_min2d_11c_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VMIN.S16 q2, q2, q5 - srli a7, a7, 1 - dl_tie728_s16_store_remainder q2, a7, a12, a2 - -dl_tie728_s16_unaligned_min2d_11c_end: - retw - - diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_mul2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_mul2d.S deleted file mode 100644 index 000c139a..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_mul2d.S +++ /dev/null @@ -1,639 +0,0 @@ -# include "dl_tie728_s16.S" - -############################################################################################################################################################ -#### -#### tie728_s16_mul2d_11c series -#### -############################################################################################################################################################ - - .align 4 - .text - .global dl_tie728_s16_mul2d_11c - .type dl_tie728_s16_mul2d_11c, @function - .section .iram1 -dl_tie728_s16_mul2d_11c: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: mul_shift - - - l32i a6, a5, 64 - l32i a7, a5, 100 - blti a6, 0, 5f - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.ZERO.QACC - EE.VMULAS.S16.QACC.LD.IP q0, a3, 16, q0, q1 - EE.VLD.128.IP q1, a4, 16 - EE.SRCMB.S16.QACC q2, a7, 0 - EE.VST.128.IP q2, a2, 16 - 0: - - EE.ZERO.QACC - EE.VMULAS.S16.QACC q0, q1 - EE.SRCMB.S16.QACC q2, a7, 0 - EE.VST.128.IP q2, a2, 16 -5: - retw - - - - .align 4 - .text - .global dl_tie728_s16_mul2d_11c_relu - .type dl_tie728_s16_mul2d_11c_relu, @function - .section .iram1 -dl_tie728_s16_mul2d_11c_relu: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: mul_shift - # a14: activation_alpha - # a15: activation_shift - - - l32i a6, a5, 64 - l32i a7, a5, 100 - l32i a8, a5, 76 - l32i a14, a5, 52 - l32i a15, a5, 60 - blti a6, 0, 5f - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.ZERO.QACC - EE.VMULAS.S16.QACC.LD.IP q0, a3, 16, q0, q1 - EE.VLD.128.IP q1, a4, 16 - EE.SRCMB.S16.QACC q2, a7, 0 - EE.VRELU.S16 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - 0: - - EE.ZERO.QACC - EE.VMULAS.S16.QACC q0, q1 - EE.SRCMB.S16.QACC q2, a7, 0 - EE.VRELU.S16 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 -5: - retw - - - .align 4 - .text - .global dl_tie728_s16_mul2d_11c_prelu - .type dl_tie728_s16_mul2d_11c_prelu, @function - .section .iram1 -dl_tie728_s16_mul2d_11c_prelu: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: mul_shift - # a14: activation_alpha_ptr - # a15: activation_shift - - - l32i a6, a5, 64 - l32i a7, a5, 100 - l32i a14, a5, 56 - l32i a15, a5, 60 - blti a6, 0, 5f - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.ZERO.QACC - EE.VMULAS.S16.QACC.LD.IP q0, a3, 16, q0, q1 - EE.VLD.128.IP q1, a4, 16 - - EE.VLD.128.IP q3, a14, 16 - EE.SRCMB.S16.QACC q2, a7, 0 - EE.VPRELU.S16 q2, q2, q3, a15 - EE.VST.128.IP q2, a2, 16 - 0: - - EE.ZERO.QACC - EE.VMULAS.S16.QACC q0, q1 - EE.VLD.128.IP q3, a14, 16 - EE.SRCMB.S16.QACC q2, a7, 0 - EE.VPRELU.S16 q2, q2, q3, a15 - EE.VST.128.IP q2, a2, 16 -5: - retw - - -############################################################################################################################################################ -#### -#### tie728_S16_unaligned_mul2d_11c series -#### -############################################################################################################################################################ - - .align 4 - .text - .global dl_tie728_s16_unaligned_mul2d_11c - .type dl_tie728_s16_unaligned_mul2d_11c, @function - .section .iram1 -dl_tie728_s16_unaligned_mul2d_11c: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: c_remainder - # a8: mul_shift - - - l32i a6, a5, 64 - l32i a7, a5, 76 - l32i a8, a5, 100 - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie718_S16_unaligned_mul2d_11c_small_remainder # channel < 8 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie718_S16_unaligned_mul2d_11c_0 - beqi a13, 8, dl_tie718_S16_unaligned_mul2d_11c_1 - - loopgtz a6, 0f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - 0: - - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie718_S16_unaligned_mul2d_11c_remainder - -dl_tie718_S16_unaligned_mul2d_11c_0: - - loopgtz a6, 1f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VST.128.IP q2, a2, 16 - 1: - - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.VST.128.IP q2, a2, 16 - j dl_tie718_S16_unaligned_mul2d_11c_remainder - -dl_tie718_S16_unaligned_mul2d_11c_1: - - loopgtz a6, 2f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_128b_unaligned_store1 q2, a2 - 2: - - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - dl_tie728_128b_unaligned_store1 q2, a2 - - j dl_tie718_S16_unaligned_mul2d_11c_remainder - -dl_tie718_S16_unaligned_mul2d_11c_small_remainder: - EE.LD.128.USAR.XP q0, a3, a7 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a7 - rur.sar_byte a12 - -dl_tie718_S16_unaligned_mul2d_11c_remainder: - - - beqz a7, dl_tie728_S16_unaligned_mul2d_11c_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.ZERO.QACC - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - - srli a7, a7, 1 - dl_tie728_s16_store_remainder q2, a7, a12, a2 - -dl_tie728_S16_unaligned_mul2d_11c_end: - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_mul2d_11c_relu - .type dl_tie728_s16_unaligned_mul2d_11c_relu, @function - .section .iram1 -dl_tie728_s16_unaligned_mul2d_11c_relu: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: c_remainder - # a8: mul_shift - # a14: activation_alpha - # a15: activation_shift - - - l32i a6, a5, 64 - l32i a7, a5, 76 - l32i a8, a5, 100 - l32i a14, a5, 52 - l32i a15, a5, 60 - - - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie718_S16_unaligned_mul2d_11c_relu_small_remainder # channel < 8 - - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie718_S16_unaligned_mul2d_11c_relu_0 - beqi a13, 8, dl_tie718_S16_unaligned_mul2d_11c_relu_1 - - - loopgtz a6, 0f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie718_S16_unaligned_mul2d_11c_relu_remainder - -dl_tie718_S16_unaligned_mul2d_11c_relu_0: - - loopgtz a6, 1f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S16 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - 1: - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.VRELU.S16 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie718_S16_unaligned_mul2d_11c_relu_remainder - -dl_tie718_S16_unaligned_mul2d_11c_relu_1: - - loopgtz a6, 2f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store1 q2, a2 - 2: - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store1 q2, a2 - j dl_tie718_S16_unaligned_mul2d_11c_relu_remainder - -dl_tie718_S16_unaligned_mul2d_11c_relu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a7 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a7 - rur.sar_byte a12 - -dl_tie718_S16_unaligned_mul2d_11c_relu_remainder: - - - beqz a7, dl_tie728_S16_unaligned_mul2d_11c_relu_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.ZERO.QACC - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.VRELU.S16 q2, a14, a15 - srli a7, a7, 1 - dl_tie728_s16_store_remainder q2, a7, a12, a2 - -dl_tie728_S16_unaligned_mul2d_11c_relu_end: - retw - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_mul2d_11c_prelu - .type dl_tie728_s16_unaligned_mul2d_11c_prelu, @function - .section .iram1 -dl_tie728_s16_unaligned_mul2d_11c_prelu: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: c_remainder - # a8: mul_shift - # a14: activation_alpha_ptr - # a15: activation_shift - - - l32i a6, a5, 64 - l32i a7, a5, 76 - l32i a8, a5, 100 - l32i a14, a5, 56 - l32i a15, a5, 60 - - - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie718_S16_unaligned_mul2d_11c_prelu_small_remainder # channel < 8 - - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie718_S16_unaligned_mul2d_11c_prelu_0 - beqi a13, 8, dl_tie718_S16_unaligned_mul2d_11c_prelu_1 - - - loopgtz a6, 0f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S16 q2, q2, q6, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.VPRELU.S16 q2, q2, q6, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie718_S16_unaligned_mul2d_11c_prelu_remainder - -dl_tie718_S16_unaligned_mul2d_11c_prelu_0: - - loopgtz a6, 1f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S16 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 - 1: - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.VPRELU.S16 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie718_S16_unaligned_mul2d_11c_prelu_remainder - -dl_tie718_S16_unaligned_mul2d_11c_prelu_1: - - loopgtz a6, 2f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S16 q2, q2, q6, a15 - dl_tie728_128b_unaligned_store1 q2, a2 - 2: - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S16.QACC q2, q5 - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.VPRELU.S16 q2, q2, q6, a15 - dl_tie728_128b_unaligned_store1 q2, a2 - j dl_tie718_S16_unaligned_mul2d_11c_prelu_remainder - - -dl_tie718_S16_unaligned_mul2d_11c_prelu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a7 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a7 - rur.sar_byte a12 - -dl_tie718_S16_unaligned_mul2d_11c_prelu_remainder: - - beqz a7, dl_tie728_S16_unaligned_mul2d_11c_prelu_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.ZERO.QACC - EE.VMULAS.S16.QACC q2, q5 - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S16.QACC q2, a8, 0 - EE.VPRELU.S16 q2, q2, q6, a15 - srli a7, a7, 1 - dl_tie728_s16_store_remainder q2, a7, a12, a2 - -dl_tie728_S16_unaligned_mul2d_11c_prelu_end: - retw \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_prelu.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_prelu.S deleted file mode 100644 index 026bc9df..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_prelu.S +++ /dev/null @@ -1,174 +0,0 @@ -#include "dl_tie728_s16.S" - - .align 4 - .text - .global dl_tie728_s16_prelu_11c - .type dl_tie728_s16_prelu_11c, @function - .section .iram1 -dl_tie728_s16_prelu_11c: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - # a5: c_rs1_1: c / 2x - 1 - # a6: c_rs2_1: c_left_1 - # a14: activation_alpha_ptr - # a15: activation_shift - - - l32i a5, a4, 88 - l32i a6, a4, 92 - l32i a14, a4, 80 # activation_alpha_ptr - l32i a15, a4, 84 # activation_shift - - - loopgtz a5, 0f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q2, a14, 16 - EE.VLD.128.IP q1, a3, 16 - EE.VLD.128.IP q3, a14, 16 - - EE.VPRELU.S16 q0, q0, q2, a15 - EE.VST.128.IP q0, a2, 16 - - EE.VPRELU.S16 q1, q1, q3, a15 - EE.VST.128.IP q1, a2, 16 - 0: - - blti a6, 0, 5f - loopgtz a6, 1f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q2, a14, 16 - - EE.VPRELU.S16 q0, q0, q2, a15 - EE.VST.128.IP q0, a2, 16 - 1: - - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q2, a14, 16 - - EE.VPRELU.S16 q0, q0, q2, a15 - EE.VST.128.IP q0, a2, 16 -5: - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_prelu_11c - .type dl_tie728_s16_unaligned_prelu_11c, @function - .section .iram1 -dl_tie728_s16_unaligned_prelu_11c: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - # a5: c_div_x_1 - # a6: c_remainder - # a14: activation_alpha_ptr - # a15: activation_shift - - - l32i a5, a4, 100 - l32i a6, a4, 136 - l32i a14, a4, 80 # activation_alpha_ptr - l32i a15, a4, 84 # activation_shift - - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a5, 0, dl_tie718_s16_unaligned_prelu_11c_small_remainder # channel < 16 - - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie718_s16_unaligned_prelu_11c_0 - beqi a13, 8, dl_tie718_s16_unaligned_prelu_11c_1 - - - loopgtz a5, 0f - EE.SRC.Q.QUP q2, q0, q1 - - EE.VLD.128.IP q3, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S16 q2, q2, q3, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a6 - rur.sar_byte a11 - EE.VLD.128.IP q3, a14, 16 - EE.SRC.Q.QUP q2, q0, q1 - EE.VPRELU.S16 q2, q2, q3, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie718_s16_unaligned_prelu_11c_remainder - - -dl_tie718_s16_unaligned_prelu_11c_0: - loopgtz a5, 0f - EE.SRC.Q.QUP q2, q0, q1 - - EE.VLD.128.IP q3, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S16 q2, q2, q3, a15 - EE.VST.128.IP q2, a2, 16 - 0: - addi a3, a3, -16 - add a3, a3, a6 - rur.sar_byte a11 - EE.VLD.128.IP q3, a14, 16 - EE.SRC.Q.QUP q2, q0, q1 - EE.VPRELU.S16 q2, q2, q3, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie718_s16_unaligned_prelu_11c_remainder - - -dl_tie718_s16_unaligned_prelu_11c_1: - - loopgtz a5, 0f - EE.SRC.Q.QUP q2, q0, q1 - - EE.VLD.128.IP q3, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S16 q2, q2, q3, a15 - dl_tie728_128b_unaligned_store1 q2, a2 - 0: - addi a3, a3, -16 - add a3, a3, a6 - rur.sar_byte a11 - EE.VLD.128.IP q3, a14, 16 - EE.SRC.Q.QUP q2, q0, q1 - EE.VPRELU.S16 q2, q2, q3, a15 - dl_tie728_128b_unaligned_store1 q2, a2 - j dl_tie718_s16_unaligned_prelu_11c_remainder - - -dl_tie718_s16_unaligned_prelu_11c_small_remainder: - EE.LD.128.USAR.XP q0, a3, a6 - rur.sar_byte a11 - - -dl_tie718_s16_unaligned_prelu_11c_remainder: - - beqz a6, dl_tie728_s16_unaligned_prelu_11c_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.VLD.128.IP q3, a14, 16 - EE.VPRELU.S16 q2, q2, q3, a15 - srli a6, a6, 1 - dl_tie728_s16_store_remainder q2, a6, a13, a2 - -dl_tie728_s16_unaligned_prelu_11c_end: - retw diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_relu.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_relu.S deleted file mode 100644 index b7361b13..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_relu.S +++ /dev/null @@ -1,161 +0,0 @@ -#include "dl_tie728_s16.S" - - .align 4 - .text - .global dl_tie728_s16_relu_11c - .type dl_tie728_s16_relu_11c, @function - .section .iram1 -dl_tie728_s16_relu_11c: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - # a5: c_rs1_1: c / 2x - 1 - # a6: c_rs2_1: c_left_1 - # a14: activation_alpha - # a15: activation_shift - - - l32i a5, a4, 88 - l32i a6, a4, 92 - l32i a14, a4, 76 # activation_alpha - l32i a15, a4, 84 # activation_shift - - - loopgtz a5, 0f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a3, 16 - - EE.VRELU.S16 q0, a14, a15 - EE.VST.128.IP q0, a2, 16 - - EE.VRELU.S16 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - 0: - - blti a6, 0, 5f - loopgtz a6, 1f - EE.VLD.128.IP q0, a3, 16 - EE.VRELU.S16 q0, a14, a15 - EE.VST.128.IP q0, a2, 16 - 1: - - EE.VLD.128.IP q0, a3, 16 - EE.VRELU.S16 q0, a14, a15 - EE.VST.128.IP q0, a2, 16 -5: - retw - - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_relu_11c - .type dl_tie728_s16_unaligned_relu_11c, @function - .section .iram1 -dl_tie728_s16_unaligned_relu_11c: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - # a5: c_div_x_1 - # a6: c_remainder - # a14: activation_alpha - # a15: activation_shift - - - l32i a5, a4, 100 - l32i a6, a4, 136 - l32i a14, a4, 76 # activation_alpha - l32i a15, a4, 84 # activation_shift - - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a5, 0, dl_tie718_s16_unaligned_relu_11c_small_remainder # channel < 16 - - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie718_s16_unaligned_relu_11c_0 - beqi a13, 8, dl_tie718_s16_unaligned_relu_11c_1 - - - loopgtz a5, 0f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a6 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie718_s16_unaligned_relu_11c_remainder - - -dl_tie718_s16_unaligned_relu_11c_0: - loopgtz a5, 0f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S16 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - 0: - addi a3, a3, -16 - add a3, a3, a6 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - EE.VRELU.S16 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie718_s16_unaligned_relu_11c_remainder - - -dl_tie718_s16_unaligned_relu_11c_1: - - loopgtz a5, 0f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store1 q2, a2 - 0: - addi a3, a3, -16 - add a3, a3, a6 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store1 q2, a2 - j dl_tie718_s16_unaligned_relu_11c_remainder - - -dl_tie718_s16_unaligned_relu_11c_small_remainder: - EE.LD.128.USAR.XP q0, a3, a6 - rur.sar_byte a11 - - -dl_tie718_s16_unaligned_relu_11c_remainder: - - beqz a6, dl_tie728_s16_unaligned_relu_11c_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.VRELU.S16 q2, a14, a15 - srli a6, a6, 1 - dl_tie728_s16_store_remainder q2, a6, a13, a2 - -dl_tie728_s16_unaligned_relu_11c_end: - retw diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_sub2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_sub2d.S deleted file mode 100644 index 75ce9935..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_sub2d.S +++ /dev/null @@ -1,1595 +0,0 @@ -# include "dl_tie728_s16.S" - -############################################################################################################################################################ -#### -#### tie728_s16_sub2d_11c series -#### -############################################################################################################################################################ -.macro dl_tie728_s16_rescale_sub_rescale_output input0, input1, output, output_scale, output_shift, neg_output_scale, rescale_input - EE.ZERO.QACC - blti \rescale_input, 2, 100f - # input1 is in the front - EE.VMULAS.s16.QACC \input0, \neg_output_scale - EE.VMULAS.s16.QACC \input1, \output_scale - j 101f - -100: #input0 is in the front - EE.VMULAS.s16.QACC \input0, \output_scale - EE.VMULAS.s16.QACC \input1, \neg_output_scale -101: - EE.SRCMB.S16.QACC \output, \output_shift, 0 -.endm - - - - .align 4 - .text - .global dl_tie728_s16_sub2d_11c - .type dl_tie728_s16_sub2d_11c, @function - .section .iram1 -dl_tie728_s16_sub2d_11c: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_2x_1 - # a7: c_left_x - - l32i a6, a5, 68 - l32i a7, a5, 72 - - beqz a6, dl_tie728_s16_sub2d_small_channel - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - loopgtz a6, 0f - EE.VLD.128.IP q2, a3, 16 - EE.VSUBS.S16.LD.INCP q3, a4, q4, q0, q1 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VSUBS.S16.LD.INCP q1, a4, q5, q2, q3 - EE.VST.128.IP q5, a2, 16 -0: - EE.VLD.128.IP q2, a3, 16 - EE.VSUBS.S16.LD.INCP q3, a4, q4, q0, q1 - EE.VST.128.IP q4, a2, 16 - beqi a7, 1, 2f #remainder == 2*16byte - beqi a7, 2, 3f #remainder == 3*16byte - -2: - EE.VSUBS.S16 q5, q2, q3 - EE.VST.128.IP q5, a2, 16 - retw -3: - EE.VLD.128.IP q0, a3, 16 - EE.VSUBS.S16.LD.INCP q1, a4, q5, q2, q3 - EE.VST.128.IP q5, a2, 16 - - EE.VSUBS.S16 q4, q0, q1 - EE.VST.128.IP q4, a2, 16 - retw - -dl_tie728_s16_sub2d_small_channel: - blti a7, 0, 5f - loopgtz a7, 1f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - EE.VSUBS.S16 q2, q0, q1 - EE.VST.128.IP q2, a2, 16 -1: - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - EE.VSUBS.S16 q2, q0, q1 - EE.VST.128.IP q2, a2, 16 -5: - retw - - - - .align 4 - .text - .global dl_tie728_s16_rescale_sub2d_11c - .type dl_tie728_s16_rescale_sub2d_11c, @function - .section .iram1 -dl_tie728_s16_rescale_sub2d_11c: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a12: rescale_input - # a13: neg_output_scale_ptr - - l32i a6, a5, 64 - l32i a7, a5, 88 - l32i a8, a5, 96 - l32i a9, a5, 92 - l32i a12, a5, 80 - - beqi a8, 1, dl_tie728_s16_rescale_sub2d_output - -dl_tie728_s16_rescale_sub2d_output_scale: - s16i a8, a1, 0 - EE.VLDBC.16 q7, a1 # all output_scale - addi a13, a5, 104 - EE.VLDBC.16 q4, a13 # all neg_output_scale - - loopgtz a6, 1f - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_sub_rescale_output q0, q1, q1, q7, a9, q4, a12 - - EE.VST.128.IP q1, a2, 16 - -1: - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_sub_rescale_output q0, q1, q1, q7, a9, q4, a12 - - EE.VST.128.IP q1, a2, 16 - retw - -dl_tie728_s16_rescale_sub2d_output: - movi a13, -1 - s16i a13, a1, 0 - EE.VLDBC.16 q7, a1 # all -1 - - blti a12, 2, dl_tie728_s16_rescale_sub2d_output_0 -#input1 in the front - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - loopgtz a6, 3f - EE.SRCMB.S16.QACC q1, a7, 0 - EE.VMULAS.S16.QACC.LD.IP q0, a3, 16, q0, q7 - EE.SRCMB.S16.QACC q1, a9, 0 - EE.LDQA.S16.128.IP a4, 16 - EE.VST.128.IP q1, a2, 16 -3: - EE.SRCMB.S16.QACC q1, a7, 0 - EE.VMULAS.S16.QACC q0, q7 - EE.SRCMB.S16.QACC q1, a9, 0 - - EE.VST.128.IP q1, a2, 16 - retw - -dl_tie728_s16_rescale_sub2d_output_0: #input0 in the front - EE.LDQA.S16.128.IP a4, 16 - loopgtz a6, 2f - EE.SRCMB.S16.QACC q1, a7, 0 - - EE.LDQA.S16.128.IP a3, 16 - EE.VMULAS.S16.QACC q1, q7 - EE.SRCMB.S16.QACC q1, a9, 0 - EE.LDQA.S16.128.IP a4, 16 - EE.VST.128.IP q1, a2, 16 -2: - EE.SRCMB.S16.QACC q1, a7, 0 - EE.LDQA.S16.128.IP a3, 16 - EE.VMULAS.S16.QACC q1, q7 - EE.SRCMB.S16.QACC q1, a9, 0 - - EE.VST.128.IP q1, a2, 16 - retw - - - .align 4 - .text - .global dl_tie728_s16_sub2d_11c_relu - .type dl_tie728_s16_sub2d_11c_relu, @function - .section .iram1 -dl_tie728_s16_sub2d_11c_relu: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_2x_1 - # a7: c_left_x - # a14: activation_alpha - # a15: activation_shift - - l32i a6, a5, 68 - l32i a7, a5, 72 - l32i a14, a5, 52 - l32i a15, a5, 60 - - beqz a6, dl_tie728_s16_sub2d_small_channel_relu - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - loopgtz a6, 0f - EE.VLD.128.IP q2, a3, 16 - EE.VSUBS.S16.LD.INCP q3, a4, q4, q0, q1 - EE.VRELU.S16 q4, a14, a15 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VSUBS.S16.LD.INCP q1, a4, q5, q2, q3 - EE.VRELU.S16 q5, a14, a15 - EE.VST.128.IP q5, a2, 16 -0: - EE.VLD.128.IP q2, a3, 16 - EE.VSUBS.S16.LD.INCP q3, a4, q4, q0, q1 - EE.VRELU.S16 q4, a14, a15 - EE.VST.128.IP q4, a2, 16 - beqi a7, 1, 2f #remainder == 2*16byte - beqi a7, 2, 3f #remainder == 3*16byte - -2: - EE.VSUBS.S16 q5, q2, q3 - EE.VRELU.S16 q5, a14, a15 - EE.VST.128.IP q5, a2, 16 - retw -3: - EE.VLD.128.IP q0, a3, 16 - EE.VSUBS.S16.LD.INCP q1, a4, q5, q2, q3 - EE.VRELU.S16 q5, a14, a15 - EE.VST.128.IP q5, a2, 16 - - EE.VSUBS.S16 q4, q0, q1 - EE.VRELU.S16 q4, a14, a15 - EE.VST.128.IP q4, a2, 16 - retw - -dl_tie728_s16_sub2d_small_channel_relu: - blti a7, 0, 5f - loopgtz a7, 1f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - EE.VSUBS.S16 q2, q0, q1 - EE.VRELU.S16 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 -1: - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - EE.VSUBS.S16 q2, q0, q1 - EE.VRELU.S16 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 -5: - retw - - - - .align 4 - .text - .global dl_tie728_s16_rescale_sub2d_11c_relu - .type dl_tie728_s16_rescale_sub2d_11c_relu, @function - .section .iram1 -dl_tie728_s16_rescale_sub2d_11c_relu: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a12: rescale_input - # a14: activation_alpha - # a15: activation_shift - - l32i a6, a5, 64 - l32i a7, a5, 88 - l32i a8, a5, 96 - l32i a9, a5, 92 - l32i a10, a5, 76 - l32i a12, a5, 80 - l32i a14, a5, 52 - l32i a15, a5, 60 - srli a10, a10, 1 - - beqi a8, 1, dl_tie728_s16_rescale_sub2d_output_relu - -dl_tie728_s16_rescale_sub2d_output_scale_relu: - s16i a8, a1, 0 - EE.VLDBC.16 q7, a1 # all output_scale - addi a13, a5, 104 - EE.VLDBC.16 q4, a13 # all neg_output_scale - - loopgtz a6, 1f - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_sub_rescale_output q0, q1, q1, q7, a9, q4, a12 - - EE.VRELU.S16 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - -1: - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_sub_rescale_output q0, q1, q1, q7, a9, q4, a12 - - EE.VRELU.S16 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - retw - -dl_tie728_s16_rescale_sub2d_output_relu: - movi a13, -1 - s16i a13, a1, 0 - EE.VLDBC.16 q7, a1 # all -1 - - blti a12, 2, dl_tie728_s16_rescale_sub2d_output_relu_0 -#input1 in the front - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - loopgtz a6, 3f - EE.SRCMB.S16.QACC q1, a7, 0 - EE.VMULAS.S16.QACC.LD.IP q0, a3, 16, q0, q7 - EE.SRCMB.S16.QACC q1, a9, 0 - EE.LDQA.S16.128.IP a4, 16 - EE.VRELU.S16 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 -3: - EE.SRCMB.S16.QACC q1, a7, 0 - EE.VMULAS.S16.QACC q0, q7 - EE.SRCMB.S16.QACC q1, a9, 0 - EE.VRELU.S16 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - retw - -dl_tie728_s16_rescale_sub2d_output_relu_0: #input0 in the front - EE.LDQA.S16.128.IP a4, 16 - loopgtz a6, 2f - EE.SRCMB.S16.QACC q1, a7, 0 - - EE.LDQA.S16.128.IP a3, 16 - EE.VMULAS.S16.QACC q1, q7 - EE.SRCMB.S16.QACC q1, a9, 0 - EE.LDQA.S16.128.IP a4, 16 - EE.VRELU.S16 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 -2: - EE.SRCMB.S16.QACC q1, a7, 0 - EE.LDQA.S16.128.IP a3, 16 - EE.VMULAS.S16.QACC q1, q7 - EE.SRCMB.S16.QACC q1, a9, 0 - EE.VRELU.S16 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - retw - - - - .align 4 - .text - .global dl_tie728_s16_sub2d_11c_prelu - .type dl_tie728_s16_sub2d_11c_prelu, @function - .section .iram1 -dl_tie728_s16_sub2d_11c_prelu: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_2x_1 - # a7: c_left_x - # a14: activation_alpha_ptr - # a15: activation_shift - - l32i a6, a5, 68 - l32i a7, a5, 72 - l32i a8, a5, 76 - l32i a14, a5, 56 - l32i a15, a5, 60 - srli a8, a8, 1 - - beqz a6, dl_tie728_s16_sub2d_small_channel_prelu - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - loopgtz a6, 0f - EE.VLD.128.IP q2, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S16.LD.INCP q3, a4, q4, q0, q1 - EE.VPRELU.S16 q4, q4, q6, a15 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S16.LD.INCP q1, a4, q5, q2, q3 - EE.VPRELU.S16 q5, q5, q6, a15 - EE.VST.128.IP q5, a2, 16 -0: - EE.VLD.128.IP q2, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S16.LD.INCP q3, a4, q4, q0, q1 - EE.VPRELU.S16 q4, q4, q6, a15 - EE.VST.128.IP q4, a2, 16 - beqi a7, 1, 2f #remainder == 2*16byte - beqi a7, 2, 3f #remainder == 3*16byte - -2: - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S16 q5, q2, q3 - EE.VPRELU.S16 q5, q5, q6, a15 - EE.VST.128.IP q5, a2, 16 - retw -3: - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S16.LD.INCP q1, a4, q5, q2, q3 - EE.VPRELU.S16 q5, q5, q6, a15 - EE.VST.128.IP q5, a2, 16 - - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S16 q4, q0, q1 - EE.VPRELU.S16 q4, q4, q6, a15 - EE.VST.128.IP q4, a2, 16 - retw - -dl_tie728_s16_sub2d_small_channel_prelu: - blti a7, 0, 5f - loopgtz a7, 1f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S16 q2, q0, q1 - EE.VPRELU.S16 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 -1: - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S16 q2, q0, q1 - EE.VPRELU.S16 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 -5: - retw - - - - .align 4 - .text - .global dl_tie728_s16_rescale_sub2d_11c_prelu - .type dl_tie728_s16_rescale_sub2d_11c_prelu, @function - .section .iram1 -dl_tie728_s16_rescale_sub2d_11c_prelu: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a12: rescale_input - # a14: activation_alpha_ptr - # a15: activation_shift - - l32i a6, a5, 64 - l32i a7, a5, 88 - l32i a8, a5, 96 - l32i a9, a5, 92 - l32i a12, a5, 80 - l32i a14, a5, 56 - l32i a15, a5, 60 - - beqi a8, 1, dl_tie728_s16_rescale_sub2d_output_prelu - -dl_tie728_s16_rescale_sub2d_output_scale_prelu: - s16i a8, a1, 0 - EE.VLDBC.16 q7, a1 # all output_scale - addi a13, a5, 104 - EE.VLDBC.16 q4, a13 # all neg_output_scale - - loopgtz a6, 1f - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S16.QACC q1, a7, 0 - - EE.VLD.128.IP q6, a14, 16 - dl_tie728_s16_rescale_sub_rescale_output q0, q1, q1, q7, a9, q4, a12 - - EE.VPRELU.S16 q1, q1, q6, a15 - EE.VST.128.IP q1, a2, 16 - -1: - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S16.QACC q1, a7, 0 - - EE.VLD.128.IP q6, a14, 16 - dl_tie728_s16_rescale_sub_rescale_output q0, q1, q1, q7, a9, q4, a12 - - EE.VPRELU.S16 q1, q1, q6, a15 - EE.VST.128.IP q1, a2, 16 - retw - -dl_tie728_s16_rescale_sub2d_output_prelu: - movi a13, -1 - s16i a13, a1, 0 - EE.VLDBC.16 q7, a1 # all -1 - - blti a12, 2, dl_tie728_s16_rescale_sub2d_output_prelu_0 -#input1 in the front - EE.LDQA.S16.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - loopgtz a6, 3f - EE.SRCMB.S16.QACC q1, a7, 0 - EE.VMULAS.S16.QACC.LD.IP q0, a3, 16, q0, q7 - EE.SRCMB.S16.QACC q1, a9, 0 - EE.VLD.128.IP q6, a14, 16 - EE.LDQA.S16.128.IP a4, 16 - EE.VPRELU.S16 q1, q1, q6, a15 - EE.VST.128.IP q1, a2, 16 -3: - EE.SRCMB.S16.QACC q1, a7, 0 - EE.VMULAS.S16.QACC q0, q7 - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S16.QACC q1, a9, 0 - EE.VPRELU.S16 q1, q1, q6, a15 - EE.VST.128.IP q1, a2, 16 - retw - -dl_tie728_s16_rescale_sub2d_output_prelu_0: #input0 in the front - EE.LDQA.S16.128.IP a4, 16 - loopgtz a6, 2f - EE.SRCMB.S16.QACC q1, a7, 0 - - EE.LDQA.S16.128.IP a3, 16 - EE.VMULAS.S16.QACC q1, q7 - EE.SRCMB.S16.QACC q1, a9, 0 - EE.VLD.128.IP q6, a14, 16 - EE.LDQA.S16.128.IP a4, 16 - EE.VPRELU.S16 q1, q1, q6, a15 - EE.VST.128.IP q1, a2, 16 -2: - EE.SRCMB.S16.QACC q1, a7, 0 - EE.LDQA.S16.128.IP a3, 16 - EE.VMULAS.S16.QACC q1, q7 - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S16.QACC q1, a9, 0 - EE.VPRELU.S16 q1, q1, q6, a15 - EE.VST.128.IP q1, a2, 16 - retw - - -############################################################################################################################################################ -#### -#### tie728_s16_unaligned_sub2d_11c series -#### -############################################################################################################################################################ - - .align 4 - .text - .global dl_tie728_s16_unaligned_sub2d_11c - .type dl_tie728_s16_unaligned_sub2d_11c, @function - .section .iram1 -dl_tie728_s16_unaligned_sub2d_11c: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a10: c_remainder - # a12: rescale_input - - l32i a6, a5, 64 - l32i a10, a5, 76 - l32i a7, a5, 88 - l32i a12, a5, 80 - - bgei a7, 0, dl_tie728_s16_unaligned_rescale_sub2d_11c - -# input0 exp = input1 exp = output exp - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie728_s16_unaligned_sub2d_11c_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie728_s16_unaligned_sub2d_11c_0 - beqi a13, 8, dl_tie728_s16_unaligned_sub2d_11c_1 - - - loopgtz a6, 0f #dl_tie728_s16_unaligned_sub2d_11c - EE.SRC.Q.QUP q2, q0, q1 - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.VSUBS.S16 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_128b_unaligned_store0 q2, a2, a13 -0: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S16 q2, q2, q5 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie728_s16_unaligned_sub2d_11c_remainder - - #output sar = 0 -dl_tie728_s16_unaligned_sub2d_11c_0: - loopgtz a6, 1f #dl_tie728_s16_unaligned_sub2d_11c_loop0 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S16 q2, q2, q5 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VST.128.IP q2, a2, 16 -1: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S16 q2, q2, q5 - EE.VST.128.IP q2, a2, 16 - j dl_tie728_s16_unaligned_sub2d_11c_remainder - - # #output sar = 8 -dl_tie728_s16_unaligned_sub2d_11c_1: - loopgtz a6, 2f #dl_tie728_s16_unaligned_sub2d_11c_loop1 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S16 q2, q2, q5 - - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_128b_unaligned_store1 q2, a2 -2: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S16 q2, q2, q5 - dl_tie728_128b_unaligned_store1 q2, a2 - j dl_tie728_s16_unaligned_sub2d_11c_remainder - -dl_tie728_s16_unaligned_sub2d_11c_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 - -dl_tie728_s16_unaligned_sub2d_11c_remainder: - - beqz a10, dl_tie728_s16_unaligned_sub2d_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VSUBS.S16 q2, q2, q5 - - srli a10, a10, 1 - dl_tie728_s16_store_remainder q2, a10, a13, a2 - -dl_tie728_s16_unaligned_sub2d_end: - - retw - - -## rescaled sub -dl_tie728_s16_unaligned_rescale_sub2d_11c: - l32i a8, a5, 96 # output_scale - l32i a9, a5, 92 # output_shift - - beqi a8, 1, dl_tie728_s16_rescale_unaligned_sub2d_output_shift - - -### rescaled to output by *scale) >> shift (left shift) -dl_tie728_s16_rescale_unaligned_sub2d_output_scale: - - s32i a8, a1, 0 - EE.VLDBC.16 q7, a1 # all output_scale - addi a13, a5, 104 - EE.VLDBC.16 q6, a13 - - blti a6, 0, dl_tie728_s16_rescale_unaligned_sub2d_scale_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 3f #dl_tie728_s16_rescale_unaligned_sub2d_11c_scale - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_sub_rescale_output q2, q1, q2, q7, a9, q6, a12 - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_128b_unaligned_store0 q2, a2, a11 - 3: - - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a6 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_sub_rescale_output q2, q1, q2, q7, a9, q6, a12 - - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie728_s16_rescale_unaligned_sub2d_scale_remainder - - -dl_tie728_s16_rescale_unaligned_sub2d_scale_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a6 #input1 sar - -dl_tie728_s16_rescale_unaligned_sub2d_scale_remainder: - beqz a10, dl_tie728_s16_unaligned_rescale_sub2d_output_scale_end # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a6 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_sub_rescale_output q2, q1, q2, q7, a9, q6, a12 - - srli a10, a10, 1 - dl_tie728_s16_store_remainder q2, a10, a12, a2 - - dl_tie728_s16_unaligned_rescale_sub2d_output_scale_end: - retw - - -### rescaled to output by right shift -dl_tie728_s16_rescale_unaligned_sub2d_output_shift: - movi a13, -1 - s16i a13, a1, 0 - EE.VLDBC.16 q7, a1 # all -1 - - blti a6, 0, dl_tie728_s16_rescale_unaligned_sub2d_shift_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 4f #dl_tie728_s16_rescale_unaligned_sub2d_11c_shift - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S16.QACC q5 - - EE.SRCMB.S16.QACC q5, a7, 0 # input1 >> shift - - blti a12, 2, 11f - EE.VMULAS.S16.QACC q2, q7 # input1 >> shift - input0 - j 12f - 11: - EE.MOV.S16.QACC q2 - EE.VMULAS.S16.QACC q5, q7 # input0 - input1 >> shift - 12: - EE.SRCMB.S16.QACC q5, a9, 0 - - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_128b_unaligned_store0 q5, a2, a13 - 4: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a6 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q5, a7, 0 - - blti a12, 2, 11f - EE.VMULAS.S16.QACC q2, q7 # input1 >> shift - input0 - j 12f - 11: - EE.MOV.S16.QACC q2 - EE.VMULAS.S16.QACC q5, q7 # input0 - input1 >> shift - 12: - - EE.SRCMB.S16.QACC q5, a9, 0 - - dl_tie728_128b_unaligned_store0 q5, a2, a13 - j dl_tie728_s16_rescale_unaligned_sub2d_shift_remainder - - - -dl_tie728_s16_rescale_unaligned_sub2d_shift_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a6 #input1 sar - -dl_tie728_s16_rescale_unaligned_sub2d_shift_remainder: - beqz a10, dl_tie728_s16_unaligned_rescale_sub2d_output_shift_end # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a6 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q5, a7, 0 - - blti a12, 2, 11f - EE.VMULAS.S16.QACC q2, q7 # input1 >> shift - input0 - j 12f - 11: - EE.MOV.S16.QACC q2 - EE.VMULAS.S16.QACC q5, q7 # input0 - input1 >> shift - 12: - EE.SRCMB.S16.QACC q5, a9, 0 - - srli a10, a10, 1 - dl_tie728_s16_store_remainder q5, a10, a13, a2 - -dl_tie728_s16_unaligned_rescale_sub2d_output_shift_end: - retw - - - -.align 4 - .text - .global dl_tie728_s16_unaligned_sub2d_11c_relu - .type dl_tie728_s16_unaligned_sub2d_11c_relu, @function - .section .iram1 -dl_tie728_s16_unaligned_sub2d_11c_relu: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a10: c_remainder - # a12: rescale_input - # a14: activation_alpha - # a15: activation_shift - - l32i a6, a5, 64 - l32i a10, a5, 76 - l32i a7, a5, 88 - l32i a12, a5, 80 - l32i a14, a5, 52 - l32i a15, a5, 60 - - bgei a7, 0, dl_tie728_s16_unaligned_rescale_sub2d_11c_relu - -# input0 exp = input1 exp = output exp - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie728_s16_unaligned_sub2d_11c_small_remainder_relu # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie728_s16_unaligned_sub2d_11c_relu_0 - beqi a13, 8, dl_tie728_s16_unaligned_sub2d_11c_relu_1 - - - loopgtz a6, 0f #dl_tie728_s16_unaligned_sub2d_11c_relu - EE.SRC.Q.QUP q2, q0, q1 - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.VSUBS.S16 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 -0: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S16 q2, q2, q5 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie728_s16_unaligned_sub2d_11c_remainder_relu - - #output sar = 0 -dl_tie728_s16_unaligned_sub2d_11c_relu_0: - loopgtz a6, 1f #dl_tie728_s16_unaligned_sub2d_11c_loop0_relu - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S16 q2, q2, q5 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S16 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 -1: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S16 q2, q2, q5 - EE.VRELU.S16 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie728_s16_unaligned_sub2d_11c_remainder_relu - - # #output sar = 8 -dl_tie728_s16_unaligned_sub2d_11c_relu_1: - loopgtz a6, 2f #dl_tie728_s16_unaligned_sub2d_11c_loop1_relu - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S16 q2, q2, q5 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store1 q2, a2 -2: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S16 q2, q2, q5 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store1 q2, a2 - j dl_tie728_s16_unaligned_sub2d_11c_remainder_relu - -dl_tie728_s16_unaligned_sub2d_11c_small_remainder_relu: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 - -dl_tie728_s16_unaligned_sub2d_11c_remainder_relu: - - beqz a10, dl_tie728_s16_unaligned_sub2d_end_relu - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VSUBS.S16 q2, q2, q5 - EE.VRELU.S16 q2, a14, a15 - srli a10, a10, 1 - dl_tie728_s16_store_remainder q2, a10, a13, a2 - -dl_tie728_s16_unaligned_sub2d_end_relu: - - retw - - -## rescaled sub -dl_tie728_s16_unaligned_rescale_sub2d_11c_relu: - l32i a8, a5, 96 # output_scale - l32i a9, a5, 92 # output_shift - - beqi a8, 1, dl_tie728_s16_rescale_unaligned_sub2d_output_shift_relu - - -### rescaled to output by *scale) >> shift -dl_tie728_s16_rescale_unaligned_sub2d_output_scale_relu: - - s32i a8, a1, 0 - EE.VLDBC.16 q7, a1 # all output_scale - addi a13, a5, 104 - EE.VLDBC.16 q6, a13 - - blti a6, 0, dl_tie728_s16_rescale_unaligned_sub2d_scale_small_remainder_relu # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 3f #dl_tie728_s16_rescale_unaligned_sub2d_11c_scale_relu - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_sub_rescale_output q2, q1, q2, q7, a9, q6, a12 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a11 - 3: - - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a6 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_sub_rescale_output q2, q1, q2, q7, a9, q6, a12 - - EE.VRELU.S16 q2, a14, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie728_s16_rescale_unaligned_sub2d_scale_remainder_relu - - -dl_tie728_s16_rescale_unaligned_sub2d_scale_small_remainder_relu: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a6 #input1 sar - -dl_tie728_s16_rescale_unaligned_sub2d_scale_remainder_relu: - beqz a10, dl_tie728_s16_unaligned_rescale_sub2d_output_scale_end_relu # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a6 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_sub_rescale_output q2, q1, q2, q7, a9, q6, a12 - - EE.VRELU.S16 q2, a14, a15 - srli a10, a10, 1 - dl_tie728_s16_store_remainder q2, a10, a13, a2 - -dl_tie728_s16_unaligned_rescale_sub2d_output_scale_end_relu: - retw - - -### rescaled to output by right shift -dl_tie728_s16_rescale_unaligned_sub2d_output_shift_relu: - movi a13, -1 - s16i a13, a1, 0 - EE.VLDBC.16 q7, a1 # all -1 - - blti a6, 0, dl_tie728_s16_rescale_unaligned_sub2d_shift_small_remainder_relu # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 4f #dl_tie728_s16_rescale_unaligned_sub2d_11c_shift_relu - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S16.QACC q5 - - EE.SRCMB.S16.QACC q5, a7, 0 # input1 >> shift - - blti a12, 2, 11f - EE.VMULAS.S16.QACC q2, q7 - j 12f - 11: - EE.MOV.S16.QACC q2 - EE.VMULAS.S16.QACC q5, q7 # input0 - input1 >> shift - 12: - - EE.SRCMB.S16.QACC q5, a9, 0 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S16 q5, a14, a15 - dl_tie728_128b_unaligned_store0 q5, a2, a13 - 4: - addi a3, a3, -16 - add a3, a3, a10 - - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a6 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S16.QACC q5 - - EE.SRCMB.S16.QACC q5, a7, 0 - - blti a12, 2, 11f - EE.VMULAS.S16.QACC q2, q7 - j 12f - - 11: - EE.MOV.S16.QACC q2 - EE.VMULAS.S16.QACC q5, q7 # input0 - input1 >> shift - 12: - - EE.SRCMB.S16.QACC q5, a9, 0 - - EE.VRELU.S16 q5, a14, a15 - dl_tie728_128b_unaligned_store0 q5, a2, a13 - j dl_tie728_s16_rescale_unaligned_sub2d_shift_remainder_relu - - -dl_tie728_s16_rescale_unaligned_sub2d_shift_small_remainder_relu: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a6 #input1 sar - -dl_tie728_s16_rescale_unaligned_sub2d_shift_remainder_relu: - beqz a10, dl_tie728_s16_unaligned_rescale_sub2d_output_shift_end_relu # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a6 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q5, a7, 0 - - blti a12, 2, 11f - EE.VMULAS.S16.QACC q2, q7 - j 12f - 11: - EE.MOV.S16.QACC q2 - EE.VMULAS.S16.QACC q5, q7 # input0 - input1 >> shift - 12: - EE.SRCMB.S16.QACC q5, a9, 0 - EE.VRELU.S16 q5, a14, a15 - srli a10, a10, 1 - dl_tie728_s16_store_remainder q5, a10, a13, a2 - -dl_tie728_s16_unaligned_rescale_sub2d_output_shift_end_relu: - retw - - - - -.align 4 - .text - .global dl_tie728_s16_unaligned_sub2d_11c_prelu - .type dl_tie728_s16_unaligned_sub2d_11c_prelu, @function - .section .iram1 -dl_tie728_s16_unaligned_sub2d_11c_prelu: - .align 4 - entry sp, 32 - - # a2: int16_t *output_ptr - # a3: int16_t *input0_ptr - # a4: int16_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a10: c_remainder - # a12: rescale_input - # a14: activation_alpha_ptr - # a15: activation_shift - - l32i a6, a5, 64 - l32i a10, a5, 76 - l32i a7, a5, 88 - l32i a12, a5, 80 - l32i a14, a5, 56 - l32i a15, a5, 60 - - bgei a7, 0, dl_tie728_s16_unaligned_rescale_sub2d_11c_prelu - -# input0 exp = input1 exp = output exp - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie728_s16_unaligned_sub2d_11c_small_remainder_prelu # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie728_s16_unaligned_sub2d_11c_prelu_0 - beqi a13, 8, dl_tie728_s16_unaligned_sub2d_11c_prelu_1 - - - loopgtz a6, 0f #dl_tie728_s16_unaligned_sub2d_11c_prelu - EE.SRC.Q.QUP q2, q0, q1 - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.VSUBS.S16 q2, q2, q5 - - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S16 q2, q2, q6, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 -0: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S16 q2, q2, q5 - EE.VPRELU.S16 q2, q2, q6, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie728_s16_unaligned_sub2d_11c_remainder_prelu - - #output sar = 0 -dl_tie728_s16_unaligned_sub2d_11c_prelu_0: - loopgtz a6, 1f #dl_tie728_s16_unaligned_sub2d_11c_loop0_prelu - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S16 q2, q2, q5 - - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S16 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 -1: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S16 q2, q2, q5 - EE.VPRELU.S16 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie728_s16_unaligned_sub2d_11c_remainder_prelu - - # #output sar = 8 -dl_tie728_s16_unaligned_sub2d_11c_prelu_1: - loopgtz a6, 2f #dl_tie728_s16_unaligned_sub2d_11c_loop1_prelu - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S16 q2, q2, q5 - - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S16 q2, q2, q6, a15 - dl_tie728_128b_unaligned_store1 q2, a2 -2: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S16 q2, q2, q5 - EE.VPRELU.S16 q2, q2, q6, a15 - dl_tie728_128b_unaligned_store1 q2, a2 - j dl_tie728_s16_unaligned_sub2d_11c_remainder_prelu - -dl_tie728_s16_unaligned_sub2d_11c_small_remainder_prelu: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 - -dl_tie728_s16_unaligned_sub2d_11c_remainder_prelu: - - beqz a10, dl_tie728_s16_unaligned_sub2d_end_prelu - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S16 q2, q2, q5 - EE.VPRELU.S16 q2, q2, q6, a15 - srli a10, a10, 1 - dl_tie728_s16_store_remainder q2, a10, a13, a2 - -dl_tie728_s16_unaligned_sub2d_end_prelu: - - retw - - -## rescaled sub -dl_tie728_s16_unaligned_rescale_sub2d_11c_prelu: - l32i a8, a5, 96 # output_scale - l32i a9, a5, 92 # output_shift - - beqi a8, 1, dl_tie728_s16_rescale_unaligned_sub2d_output_shift_prelu - - -### rescaled to output by *scale) >> shift -dl_tie728_s16_rescale_unaligned_sub2d_output_scale_prelu: - - s32i a8, a1, 0 - EE.VLDBC.16 q7, a1 # all output_scale - addi a13, a5, 104 - EE.VLDBC.16 q6, a13 - - blti a6, 0, dl_tie728_s16_rescale_unaligned_sub2d_scale_small_remainder_prelu # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 3f #dl_tie728_s16_rescale_unaligned_sub2d_11c_scale_prelu - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - dl_tie728_s16_rescale_sub_rescale_output q2, q1, q2, q7, a9, q6, a12 - EE.VLD.128.IP q5, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S16 q2, q2, q5, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a11 - 3: - - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a6 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - EE.VLD.128.IP q5, a14, 16 - dl_tie728_s16_rescale_sub_rescale_output q2, q1, q2, q7, a9, q6, a12 - - EE.VPRELU.S16 q2, q2, q5, a15 - dl_tie728_128b_unaligned_store0 q2, a2, a13 - j dl_tie728_s16_rescale_unaligned_sub2d_scale_remainder_prelu - - -dl_tie728_s16_rescale_unaligned_sub2d_scale_small_remainder_prelu: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a6 #input1 sar - -dl_tie728_s16_rescale_unaligned_sub2d_scale_remainder_prelu: - beqz a10, dl_tie728_s16_unaligned_rescale_sub2d_output_scale_end_prelu # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a6 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q1, a7, 0 - - EE.VLD.128.IP q5, a14, 16 - dl_tie728_s16_rescale_sub_rescale_output q2, q1, q2, q7, a9, q6, a12 - - EE.VPRELU.S16 q2, q2, q5, a15 - srli a10, a10, 1 - dl_tie728_s16_store_remainder q2, a10, a13, a2 - -dl_tie728_s16_unaligned_rescale_sub2d_output_scale_end_prelu: - retw - - -### rescaled to output by right shift -dl_tie728_s16_rescale_unaligned_sub2d_output_shift_prelu: - movi a13, -1 - s16i a13, a1, 0 - EE.VLDBC.16 q7, a1 # all -1 - - blti a6, 0, dl_tie728_s16_rescale_unaligned_sub2d_shift_small_remainder_prelu # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 4f #dl_tie728_s16_rescale_unaligned_sub2d_11c_shift_prelu - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S16.QACC q5 - - EE.SRCMB.S16.QACC q5, a7, 0 - - blti a12, 2, 11f - EE.VMULAS.S16.QACC q2, q7 # input1 >> shift - input0 - j 12f - 11: - EE.MOV.S16.QACC q2 - EE.VMULAS.S16.QACC q5, q7 # input0 - input1 >> shift - 12: - - EE.SRCMB.S16.QACC q5, a9, 0 - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S16 q5, q5, q6, a15 - dl_tie728_128b_unaligned_store0 q5, a2, a13 - 4: - addi a3, a3, -16 - add a3, a3, a10 - - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a6 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S16.QACC q5 - - EE.SRCMB.S16.QACC q5, a7, 0 - - blti a12, 2, 11f - EE.VMULAS.S16.QACC q2, q7 # input1 >> shift - input0 - j 12f - 11: - EE.MOV.S16.QACC q2 - EE.VMULAS.S16.QACC q5, q7 # input0 - input1 >> shift - 12: - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S16.QACC q5, a9, 0 - - EE.VPRELU.S16 q5, q5, q6, a15 - dl_tie728_128b_unaligned_store0 q5, a2, a13 - j dl_tie728_s16_rescale_unaligned_sub2d_shift_remainder_prelu - - - -dl_tie728_s16_rescale_unaligned_sub2d_shift_small_remainder_prelu: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a6 #input1 sar - -dl_tie728_s16_rescale_unaligned_sub2d_shift_remainder_prelu: - beqz a10, dl_tie728_s16_unaligned_rescale_sub2d_output_shift_end_prelu # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a6 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S16.QACC q5 - EE.SRCMB.S16.QACC q5, a7, 0 - - blti a12, 2, 11f - EE.VMULAS.S16.QACC q2, q7 # input0 - input1 >> shift - j 12f - 11: - EE.MOV.S16.QACC q2 - EE.VMULAS.S16.QACC q5, q7 # input0 - input1 >> shift - 12: - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S16.QACC q5, a9, 0 - - EE.VPRELU.S16 q5, q5, q6, a15 - srli a10, a10, 1 - dl_tie728_s16_store_remainder q5, a10, a13, a2 - -dl_tie728_s16_unaligned_rescale_sub2d_output_shift_end_prelu: - retw \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_unaligned.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_unaligned.S deleted file mode 100644 index 0fe313d7..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_unaligned.S +++ /dev/null @@ -1,371 +0,0 @@ -#pragma once - -############################################################################################################################################################ -# tie728_32b_aligned_vector series -############################################################################################################################################################ -.macro tie728_32b_aligned_vector_store output_v, output_ptr, temp - EE.MOVI.32.A \output_v, \temp, 0 - s32i \temp, \output_ptr, 0 - EE.MOVI.32.A \output_v, \temp, 1 - s32i \temp, \output_ptr, 4 - EE.MOVI.32.A \output_v, \temp, 2 - s32i \temp, \output_ptr, 8 - EE.MOVI.32.A \output_v, \temp, 3 - s32i \temp, \output_ptr, 12 - addi \output_ptr, \output_ptr, 16 -.endm - -.macro tie728_s16_32b_aligned_vector_shift_bias_relu_store output_v, output_ptr, mac_shift, bias_v, bias_ptr, activation_alpha, activation_shift, temp - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VADDS.S16 \output_v, \output_v, \bias_v # bias - EE.VRELU.S16 \output_v, \activation_alpha, \activation_shift # LeakyReLU - tie728_32b_aligned_vector_store \output_v, \output_ptr, \temp # store -.endm - -.macro tie728_s16_32b_aligned_vector_shift_bias_prelu_store output_v, output_ptr, mac_shift, bias_v, bias_ptr, activation_v, activation_alpha_ptr, activation_shift, temp - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - EE.VLD.128.IP \activation_v, \activation_alpha_ptr, 16 # load PReLU alph - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VADDS.S16 \output_v, \output_v, \bias_v # bias - EE.VPRELU.S16 \output_v, \output_v, \activation_v, \activation_shift # PReLU - tie728_32b_aligned_vector_store \output_v, \output_ptr, \temp # store -.endm - -.macro tie728_s16_32b_aligned_vector_shift_bias_store output_v, output_ptr, mac_shift, bias_v, bias_ptr, temp - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VADDS.S16 \output_v, \output_v, \bias_v # bias - tie728_32b_aligned_vector_store \output_v, \output_ptr, \temp # store -.endm - -.macro tie728_s16_32b_aligned_vector_shift_relu_store output_v, output_ptr, mac_shift, activation_alpha, activation_shift, temp - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VRELU.S16 \output_v, \activation_alpha, \activation_shift # LeakyReLU - tie728_32b_aligned_vector_store \output_v, \output_ptr, \temp # store -.endm - -.macro tie728_s16_32b_aligned_vector_shift_prelu_store output_v, output_ptr, mac_shift, activation_v, activation_alpha_ptr, activation_shift, temp - EE.VLD.128.IP \activation_v, \activation_alpha_ptr, 16 # load PReLU alph - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VPRELU.S16 \output_v, \output_v, \activation_v, \activation_shift # PReLU - tie728_32b_aligned_vector_store \output_v, \output_ptr, \temp # store -.endm - -.macro tie728_s16_32b_aligned_vector_shift_store output_v, output_ptr, mac_shift, temp - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - tie728_32b_aligned_vector_store \output_v, \output_ptr, \temp # store -.endm - - - - - - -############################################################################################################################################################ -# tie728_64b_aligned_vector series -############################################################################################################################################################ -.macro tie728_64b_aligned_vector_store output_v, output_ptr - EE.VST.L.64.IP \output_v, \output_ptr, 8 - EE.VST.H.64.IP \output_v, \output_ptr, 8 -.endm - -.macro tie728_s16_64b_aligned_vector_shift_bias_relu_store output_v, output_ptr, mac_shift, bias_v, bias_ptr, activation_alpha, activation_shift - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VADDS.S16 \output_v, \output_v, \bias_v # bias - EE.VRELU.S16 \output_v, \activation_alpha, \activation_shift # LeakyReLU - tie728_64b_aligned_vector_store \output_v, \output_ptr # store -.endm - -.macro tie728_s16_64b_aligned_vector_shift_bias_prelu_store output_v, output_ptr, mac_shift, bias_v, bias_ptr, activation_v, activation_alpha_ptr, activation_shift - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - EE.VLD.128.IP \activation_v, \activation_alpha_ptr, 16 # load PReLU alph - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VADDS.S16 \output_v, \output_v, \bias_v # bias - EE.VPRELU.S16 \output_v, \output_v, \activation_v, \activation_shift # PReLU - tie728_64b_aligned_vector_store \output_v, \output_ptr # store -.endm - -.macro tie728_s16_64b_aligned_vector_shift_bias_store output_v, output_ptr, mac_shift, bias_v, bias_ptr - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VADDS.S16 \output_v, \output_v, \bias_v # bias - tie728_64b_aligned_vector_store \output_v, \output_ptr # store -.endm - -.macro tie728_s16_64b_aligned_vector_shift_relu_store output_v, output_ptr, mac_shift, activation_alpha, activation_shift - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VRELU.S16 \output_v, \activation_alpha, \activation_shift # LeakyReLU - tie728_64b_aligned_vector_store \output_v, \output_ptr # store -.endm - -.macro tie728_s16_64b_aligned_vector_shift_prelu_store output_v, output_ptr, mac_shift, activation_v, activation_alpha_ptr, activation_shift - EE.VLD.128.IP \activation_v, \activation_alpha_ptr, 16 # load PReLU alph - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VPRELU.S16 \output_v, \output_v, \activation_v, \activation_shift # PReLU - tie728_64b_aligned_vector_store \output_v, \output_ptr # store -.endm - -.macro tie728_s16_64b_aligned_vector_shift_store output_v, output_ptr, mac_shift - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - tie728_64b_aligned_vector_store \output_v, \output_ptr # store -.endm - - - - -############################################################################################################################################################ -# tie728_128b_aligned_vector series -############################################################################################################################################################ -.macro tie728_128b_aligned_vector_store output_v, output_ptr - EE.VST.128.IP \output_v, \output_ptr, 16 -.endm - -.macro tie728_s16_128b_aligned_vector_shift_bias_relu_store output_v, output_ptr, mac_shift, bias_v, bias_ptr, activation_alpha, activation_shift - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VADDS.S16 \output_v, \output_v, \bias_v # bias - EE.VRELU.S16 \output_v, \activation_alpha, \activation_shift # LeakyReLU - tie728_128b_aligned_vector_store \output_v, \output_ptr # store -.endm - -.macro tie728_s16_128b_aligned_vector_shift_bias_prelu_store output_v, output_ptr, mac_shift, bias_v, bias_ptr, activation_v, activation_alpha_ptr, activation_shift - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - EE.VLD.128.IP \activation_v, \activation_alpha_ptr, 16 # load PReLU alph - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VADDS.S16 \output_v, \output_v, \bias_v # bias - EE.VPRELU.S16 \output_v, \output_v, \activation_v, \activation_shift # PReLU - tie728_128b_aligned_vector_store \output_v, \output_ptr # store -.endm - -.macro tie728_s16_128b_aligned_vector_shift_bias_store output_v, output_ptr, mac_shift, bias_v, bias_ptr - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VADDS.S16 \output_v, \output_v, \bias_v # bias - tie728_128b_aligned_vector_store \output_v, \output_ptr # store -.endm - -.macro tie728_s16_128b_aligned_vector_shift_relu_store output_v, output_ptr, mac_shift, activation_alpha, activation_shift - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VRELU.S16 \output_v, \activation_alpha, \activation_shift # LeakyReLU - tie728_128b_aligned_vector_store \output_v, \output_ptr # store -.endm - -.macro tie728_s16_128b_aligned_vector_shift_prelu_store output_v, output_ptr, mac_shift, activation_v, activation_alpha_ptr, activation_shift - EE.VLD.128.IP \activation_v, \activation_alpha_ptr, 16 # load PReLU alph - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VPRELU.S16 \output_v, \output_v, \activation_v, \activation_shift # PReLU - tie728_128b_aligned_vector_store \output_v, \output_ptr # store -.endm - -.macro tie728_s16_128b_aligned_vector_shift_store output_v, output_ptr, mac_shift - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - tie728_128b_aligned_vector_store \output_v, \output_ptr # store -.endm - -############################################################################################################################################################ -# tie728_element series -############################################################################################################################################################ -.macro tie728_s16_element_store output_ptr, output - clamps \output, \output, 15 - s16i \output, \output_ptr, 0 - addi \output_ptr, \output_ptr, 2 -.endm - -.macro tie728_s16_element_bias output, bias_ptr, bias - l16si \bias, \bias_ptr, 0 - addi \bias_ptr, \bias_ptr, 2 - add \output, \output, \bias -.endm - -.macro tie728_s16_element_relu output - bgez \output, 0f - movi \output, 0 -0: -.endm - -.macro tie728_s16_element_leakyrelu output, alpha - bgez \output, 0f - mull \output, \output, \alpha - sra \output, \output -0: -.endm - -.macro tie728_s16_element_prelu output, alpha_ptr, alpha - l16si \alpha, \alpha_ptr, 0 - addi \alpha_ptr, \alpha_ptr, 2 - - bgez \output, 0f - mull \output, \output, \alpha - sra \output, \output -0: -.endm - -.macro tie728_s16_element_shift_bias_relu_store output_ptr, mac_shift, bias_ptr, temp1, temp2 - EE.SRS.ACCX \temp1, \mac_shift, 0 # shift - tie728_s16_element_bias \temp1, \bias_ptr, \temp2 # bias - tie728_s16_element_relu \temp1 # relu - tie728_s16_element_store \output_ptr, \temp1 # store -.endm - -# DONNOT forget to set ssr before call this macro -.macro tie728_s16_element_shift_bias_leakyrelu_store output_ptr, mac_shift, bias_ptr, activation_alpha, temp1, temp2 - EE.SRS.ACCX \temp1, \mac_shift, 0 # shift - tie728_s16_element_bias \temp1, \bias_ptr, \temp2 # bias - tie728_s16_element_leakyrelu \temp1, \activation_alpha # leakyrelu - tie728_s16_element_store \output_ptr, \temp1 # store -.endm - -# DONNOT forget to set ssr before call this macro -.macro tie728_s16_element_shift_bias_prelu_store output_ptr, mac_shift, bias_ptr, activation_alpha_ptr, temp1, temp2 - EE.SRS.ACCX \temp1, \mac_shift, 0 # shift - tie728_s16_element_bias \temp1, \bias_ptr, \temp2 # bias - tie728_s16_element_prelu \temp1, \activation_alpha_ptr, \temp2 # prelu - tie728_s16_element_store \output_ptr, \temp1 # store -.endm - -.macro tie728_s16_element_shift_bias_store output_ptr, mac_shift, bias_ptr, temp1, temp2 - EE.SRS.ACCX \temp1, \mac_shift, 0 # shift - tie728_s16_element_bias \temp1, \bias_ptr, \temp2 # bias - tie728_s16_element_store \output_ptr, \temp1 # store -.endm - -.macro tie728_s16_element_shift_relu_store output_ptr, mac_shift, temp1 - EE.SRS.ACCX \temp1, \mac_shift, 0 # shift - tie728_s16_element_relu \temp1 # relu - tie728_s16_element_store \output_ptr, \temp1 # store -.endm - -# DONNOT forget to set ssr before call this macro -.macro tie728_s16_element_shift_leakyrelu_store output_ptr, mac_shift, activation_alpha, temp1 - EE.SRS.ACCX \temp1, \mac_shift, 0 # shift - tie728_s16_element_leakyrelu \temp1, \activation_alpha # leakyrelu - tie728_s16_element_store \output_ptr, \temp1 # store -.endm - -# DONNOT forget to set ssr before call this macro -.macro tie728_s16_element_shift_prelu_store output_ptr, mac_shift, activation_alpha_ptr, temp1, temp2 - EE.SRS.ACCX \temp1, \mac_shift, 0 # shift - tie728_s16_element_prelu \temp1, \activation_alpha_ptr, \temp2 # prelu - tie728_s16_element_store \output_ptr, \temp1 # store -.endm - -.macro tie728_s16_element_shift_store output_ptr, mac_shift, temp1 - EE.SRS.ACCX \temp1, \mac_shift, 0 # shift - tie728_s16_element_store \output_ptr, \temp1 # store -.endm - - -############################################################################################################################################################ -# tie728_s16_variable_vector series -############################################################################################################################################################ -.macro tie728_s16_variable_vector_store output_ptr, vector, number, temp - 7: - bbci \number, 2, 3f - bbci \number, 1, 5f - bbci \number, 0, 6f - - # number == 0x111 - EE.MOVI.32.A \vector, \temp, 0 - s32i \temp, \output_ptr, 0 - EE.MOVI.32.A \vector, \temp, 1 - s32i \temp, \output_ptr, 4 - EE.MOVI.32.A \vector, \temp, 2 - s32i \temp, \output_ptr, 8 - EE.MOVI.32.A \vector, \temp, 3 - s16i \temp, \output_ptr, 12 - j 0f - 6: - # number == 0x110 - EE.MOVI.32.A \vector, \temp, 0 - s32i \temp, \output_ptr, 0 - EE.MOVI.32.A \vector, \temp, 1 - s32i \temp, \output_ptr, 4 - EE.MOVI.32.A \vector, \temp, 2 - s32i \temp, \output_ptr, 8 - j 0f - 5: - # number == 0x10_ - bbci \number, 0, 4f - - # number == 0x101 - EE.MOVI.32.A \vector, \temp, 0 - s32i \temp, \output_ptr, 0 - EE.MOVI.32.A \vector, \temp, 1 - s32i \temp, \output_ptr, 4 - EE.MOVI.32.A \vector, \temp, 2 - s16i \temp, \output_ptr, 8 - j 0f - 4: - # number == 0x100 - EE.MOVI.32.A \vector, \temp, 0 - s32i \temp, \output_ptr, 0 - EE.MOVI.32.A \vector, \temp, 1 - s32i \temp, \output_ptr, 4 - j 0f - 3: - # number == 0x0__ - bbci \number, 1, 1f - bbci \number, 0, 2f - - # number == 0x011 - EE.MOVI.32.A \vector, \temp, 0 - s32i \temp, \output_ptr, 0 - EE.MOVI.32.A \vector, \temp, 1 - s16i \temp, \output_ptr, 4 - j 0f - 2: - # number == 0x010 - EE.MOVI.32.A \vector, \temp, 0 - s32i \temp, \output_ptr, 0 - j 0f - 1: - # number == 0x001 - EE.MOVI.32.A \vector, \temp, 0 - s16i \temp, \output_ptr, 0 - 0: -.endm - -.macro tie728_s16_variable_vector_shift_bias_relu_store output_v, output_ptr, mac_shift, bias_v, bias_ptr, activation_alpha, activation_shift, number, temp - EE.VLD.128.IP \bias_v, \bias_ptr, 0 # load bias - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VADDS.S16 \output_v, \output_v, \bias_v # bias - EE.VRELU.S16 \output_v, \activation_alpha, \activation_shift # LeakyReLU - tie728_s16_variable_vector_store \output_ptr, \output_v, \number, \temp # store - -.endm - -.macro tie728_s16_variable_vector_shift_bias_prelu_store output_v, output_ptr, mac_shift, bias_v, bias_ptr, activation_v, activation_alpha_ptr, activation_shift, number, temp - EE.VLD.128.IP \bias_v, \bias_ptr, 0 # load bias - EE.VLD.128.IP \activation_v, \activation_alpha_ptr, 0 # load PReLU alph - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VADDS.S16 \output_v, \output_v, \bias_v # bias - EE.VPRELU.S16 \output_v, \output_v, \activation_v, \activation_shift # PReLU - tie728_s16_variable_vector_store \output_ptr, \output_v, \number, \temp # store -.endm - -.macro tie728_s16_variable_vector_shift_bias_store output_v, output_ptr, mac_shift, bias_v, bias_ptr, number, temp - EE.VLD.128.IP \bias_v, \bias_ptr, 0 # load bias - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VADDS.S16 \output_v, \output_v, \bias_v # bias - tie728_s16_variable_vector_store \output_ptr, \output_v, \number, \temp # store -.endm - -.macro tie728_s16_variable_vector_shift_relu_store output_v, output_ptr, mac_shift, activation_alpha, activation_shift, number, temp - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VRELU.S16 \output_v, \activation_alpha, \activation_shift # LeakyReLU - tie728_s16_variable_vector_store \output_ptr, \output_v, \number, \temp # store -.endm - -.macro tie728_s16_variable_vector_shift_prelu_store output_v, output_ptr, mac_shift, activation_v, activation_alpha_ptr, activation_shift, number, temp - EE.VLD.128.IP \activation_v, \activation_alpha_ptr, 0 # load PReLU alph - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - EE.VPRELU.S16 \output_v, \output_v, \activation_v, \activation_shift # PReLU - tie728_s16_variable_vector_store \output_ptr, \output_v, \number, \temp # store -.endm - -.macro tie728_s16_variable_vector_shift_store output_v, output_ptr, mac_shift, number, temp - EE.SRCMB.S16.QACC \output_v, \mac_shift, 0 # QACC -> QR without round, roud operation is wrong in this instruction - tie728_s16_variable_vector_store \output_ptr, \output_v, \number, \temp # store -.endm \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_unaligned_conv2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_unaligned_conv2d.S deleted file mode 100644 index 76824620..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_unaligned_conv2d.S +++ /dev/null @@ -1,2651 +0,0 @@ -#include "dl_tie728_s16_unaligned.S" - -############################################################################################################################################################ -#### -#### tie728_s16_unaligned_conv2d_11cn series -#### -############################################################################################################################################################ -.macro tie728_s16_unaligned_conv2d_11c8 input_v, input_front, input_back, filter_v0, filter_v1, input_ptr, filter_ptr, c_div_x_1, c_remainder - # input_v: 8 input elements - # filter_v0: 8 filter elements - # filter_v1: 8 filter elements - # input_ptr: input_ptr - # filter_ptr: filter_ptr - # c_div_x_1: input_channel // 8 - 1 - - EE.LD.128.USAR.IP \input_front, \input_ptr, 16 - - blti \c_div_x_1, 0, 7f - EE.LD.128.USAR.IP \input_back, \input_ptr, 16 - EE.VLD.128.IP \filter_v0, \filter_ptr, 16 - EE.VLD.128.IP \filter_v1, \filter_ptr, 16 - loopgtz \c_div_x_1, 8f - EE.SRC.Q.QUP \input_v, \input_front, \input_back - EE.LD.128.USAR.IP \input_back, \input_ptr, 16 - - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v, 0 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v, 1 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v, 2 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v, 3 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v, 4 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v, 5 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v, 6 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v, 7 -8: - # last entire-128b - EE.SRC.Q.QUP \input_v, \input_front, \input_back - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v, 0 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v, 1 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v, 2 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v, 3 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v, 4 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v, 5 - EE.VSMULAS.S16.QACC \filter_v0, \input_v, 6 - EE.VSMULAS.S16.QACC \filter_v1, \input_v, 7 - beqz \c_remainder, 0f # jump to c_remainder == 0 -7: - # c_remainder - EE.LD.128.USAR.XP \input_back, \input_ptr, \c_remainder - EE.SRC.Q.QUP \input_v, \input_front, \input_back - EE.VLD.128.IP \filter_v0, \filter_ptr, 16 - - bbci \c_remainder, 3, 3f - # remainder == 0x1__0 - EE.VLD.128.IP \filter_v1, \filter_ptr, 16 - bbci \c_remainder, 2, 5f - # remainder == 0x11_0 - bbci \c_remainder, 1, 6f - # remainder == 0x1110, 7 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v, 0 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v, 1 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v, 2 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v, 3 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v, 4 - EE.VSMULAS.S16.QACC \filter_v1, \input_v, 5 - EE.VSMULAS.S16.QACC \filter_v0, \input_v, 6 - j 0f - -6: # remainder == 0x1100, 6 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v, 0 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v, 1 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v, 2 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v, 3 - EE.VSMULAS.S16.QACC \filter_v0, \input_v, 4 - EE.VSMULAS.S16.QACC \filter_v1, \input_v, 5 - j 0f - -5: # remainder == 0x10_0 - bbci \c_remainder, 1, 4f - # remainder == 0x1010, 5 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v, 0 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v, 1 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v, 2 - EE.VSMULAS.S16.QACC \filter_v1, \input_v, 3 - EE.VSMULAS.S16.QACC \filter_v0, \input_v, 4 - j 0f - -4: # remainder == 0x1000, 4 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v, 0 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v, 1 - EE.VSMULAS.S16.QACC \filter_v0, \input_v, 2 - EE.VSMULAS.S16.QACC \filter_v1, \input_v, 3 - j 0f - -3: # remainder == 0x0__0 - bbci \c_remainder, 2, 1f - # remainder == 0x01_0 - EE.VLD.128.IP \filter_v1, \filter_ptr, 16 - - bbci \c_remainder, 1, 2f - # remainder == 0x0110, 3 - EE.VSMULAS.S16.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v, 0 - EE.VSMULAS.S16.QACC \filter_v1, \input_v, 1 - EE.VSMULAS.S16.QACC \filter_v0, \input_v, 2 - j 0f - -2: # remainder == 0x0100, 2 - EE.VSMULAS.S16.QACC \filter_v0, \input_v, 0 - EE.VSMULAS.S16.QACC \filter_v1, \input_v, 1 - j 0f - -1: # remainder == 0x0010, 1 - EE.VSMULAS.S16.QACC \filter_v0, \input_v, 0 - -0: - addi \input_ptr, \input_ptr, -16 - -.endm - - - -.macro tie728_s16_unaligned_conv2d_11c1 input_v, input_front, input_back, filter_v, filter_front, filter_back, input_ptr, filter_ptr, c_div_x_1, c_remainder, temp, zero - # input_v: 8 input elements - # filter_v: 8 filter elements - # filter_v1: 8 filter elements - # input_ptr: input_ptr - # filter_ptr: filter_ptr - # c_div_x_1: input_channel // 8 - 1 - - EE.LD.128.USAR.IP \input_front, \input_ptr, 16 - EE.LD.128.USAR.IP \filter_front, \filter_ptr, 16 - - blti \c_div_x_1, 0, 7f # input_channel < 8 - - EE.LD.128.USAR.IP \input_back, \input_ptr, 16 - loopgtz \c_div_x_1, 8f - EE.SRC.Q.QUP \input_v, \input_front, \input_back - - EE.LD.128.USAR.IP \filter_back, \filter_ptr, 16 - EE.SRC.Q.QUP \filter_v, \filter_front, \filter_back - - EE.LD.128.USAR.IP \input_back, \input_ptr, 16 - EE.VMULAS.S16.ACCX \filter_v, \input_v - 8: - # last entire-128b - EE.SRC.Q.QUP \input_v, \input_front, \input_back - - EE.LD.128.USAR.IP \filter_back, \filter_ptr, 16 - EE.SRC.Q.QUP \filter_v, \filter_front, \filter_back - - EE.VMULAS.S16.ACCX \filter_v, \input_v - - beqz \c_remainder, 0f -7: - # c_remainder > 0 - EE.LD.128.USAR.XP \input_back, \input_ptr, \c_remainder - EE.SRC.Q.QUP \input_v, \input_front, \input_back - - EE.LD.128.USAR.XP \filter_back, \filter_ptr, \c_remainder - EE.SRC.Q.QUP \filter_v, \filter_front, \filter_back - - EE.SLCXXP.2Q \input_back, \input_v, \temp, \zero - EE.SLCXXP.2Q \filter_back, \filter_v, \temp, \zero - - EE.VMULAS.S16.ACCX \filter_v, \input_v -0: - addi \input_ptr, \input_ptr, -16 - addi \filter_ptr, \filter_ptr, -16 -.endm - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_11cn - .type dl_tie728_s16_unaligned_conv2d_11cn, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_11cn: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args, 15 - input_channel % (vector_width / element_width) * sizeof(feature_t) - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 64 # a8: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a9, a4, 96 # a9: n_div_x = output_channel / (vector_width / element_width) - - blti a9, 1, tie728_s16_unaligned_conv2d_11cn_n_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_11cn_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_11cn_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_11cn_64b - tie728_s16_unaligned_conv2d_11cn_32b: - tie728_s16_unaligned_conv2d_11cn_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_32b_aligned_vector_shift_store q0, a2, a8, a15 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_32b_multiple_loop - j tie728_s16_unaligned_conv2d_11cn_n_remainder - - tie728_s16_unaligned_conv2d_11cn_64b: - tie728_s16_unaligned_conv2d_11cn_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_64b_aligned_vector_shift_store q0, a2, a8 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_64b_multiple_loop - j tie728_s16_unaligned_conv2d_11cn_n_remainder - - tie728_s16_unaligned_conv2d_11cn_128b: - tie728_s16_unaligned_conv2d_11cn_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_128b_aligned_vector_shift_store q0, a2, a8 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_11cn_n_remainder: - l32i a9, a4, 140 # a9: n_remainder - beqz a9, tie728_s16_unaligned_conv2d_11cn_n_remainder_end - - movi a10, 15 - sub a10, a10, a7 # a10: 15 - c_remainder - movi a11, 0 # a11: activation_shift = zero - - tie728_s16_unaligned_conv2d_11cn_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - tie728_s16_unaligned_conv2d_11c1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a10, a11 - tie728_s16_element_shift_store a2, a8, a15 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_n_remainder_loop - - tie728_s16_unaligned_conv2d_11cn_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_11cn_relu - .type dl_tie728_s16_unaligned_conv2d_11cn_relu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_11cn_relu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args, 15 - input_channel % (vector_width / element_width) * sizeof(feature_t) - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 64 # a8: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a9, a4, 96 # a9: n_div_x = output_channel / (vector_width / element_width) - movi a10, 0 # a10: activation_alpha = zero - movi a11, 0 # a11: activation_shift = zero - - blti a9, 1, tie728_s16_unaligned_conv2d_11cn_relu_n_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_11cn_relu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_11cn_relu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_11cn_relu_64b - tie728_s16_unaligned_conv2d_11cn_relu_32b: - tie728_s16_unaligned_conv2d_11cn_relu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_32b_aligned_vector_shift_relu_store q0, a2, a8, a10, a11, a15 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_relu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_11cn_relu_n_remainder - - tie728_s16_unaligned_conv2d_11cn_relu_64b: - tie728_s16_unaligned_conv2d_11cn_relu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_64b_aligned_vector_shift_relu_store q0, a2, a8, a10, a11 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_relu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_11cn_relu_n_remainder - - tie728_s16_unaligned_conv2d_11cn_relu_128b: - tie728_s16_unaligned_conv2d_11cn_relu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_128b_aligned_vector_shift_relu_store q0, a2, a8, a10, a11 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_relu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_11cn_relu_n_remainder: - l32i a9, a4, 140 # a9: n_remainder - beqz a9, tie728_s16_unaligned_conv2d_11cn_relu_n_remainder_end - - movi a10, 15 - sub a10, a10, a7 # a10: 15 - c_remainder - - tie728_s16_unaligned_conv2d_11cn_relu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - tie728_s16_unaligned_conv2d_11c1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a10, a11 - tie728_s16_element_shift_relu_store a2, a8, a15 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_relu_n_remainder_loop - - tie728_s16_unaligned_conv2d_11cn_relu_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_11cn_leakyrelu - .type dl_tie728_s16_unaligned_conv2d_11cn_leakyrelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_11cn_leakyrelu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args, 15 - input_channel % (vector_width / element_width) * sizeof(feature_t) - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 64 # a8: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a9, a4, 96 # a9: n_div_x = output_channel / (vector_width / element_width) - l32i a10, a4, 76 # a10: activation_alpha - l32i a11, a4, 84 # a11: activation_shift - - blti a9, 1, tie728_s16_unaligned_conv2d_11cn_leakyrelu_n_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_11cn_leakyrelu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_11cn_leakyrelu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_11cn_leakyrelu_64b - tie728_s16_unaligned_conv2d_11cn_leakyrelu_32b: - tie728_s16_unaligned_conv2d_11cn_leakyrelu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_32b_aligned_vector_shift_relu_store q0, a2, a8, a10, a11, a15 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_leakyrelu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_11cn_leakyrelu_n_remainder - - tie728_s16_unaligned_conv2d_11cn_leakyrelu_64b: - tie728_s16_unaligned_conv2d_11cn_leakyrelu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_64b_aligned_vector_shift_relu_store q0, a2, a8, a10, a11 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_leakyrelu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_11cn_leakyrelu_n_remainder - - tie728_s16_unaligned_conv2d_11cn_leakyrelu_128b: - tie728_s16_unaligned_conv2d_11cn_leakyrelu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_128b_aligned_vector_shift_relu_store q0, a2, a8, a10, a11 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_leakyrelu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_11cn_leakyrelu_n_remainder: - l32i a9, a4, 140 # a9: n_remainder - beqz a9, tie728_s16_unaligned_conv2d_11cn_leakyrelu_n_remainder_end - - ssr a11 # ssr: activation_shift - movi a11, 15 - sub a11, a11, a7 # a11: 15 - c_remainder - movi a12, 0 # a12: zero - - tie728_s16_unaligned_conv2d_11cn_leakyrelu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - tie728_s16_unaligned_conv2d_11c1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a11, a12 - tie728_s16_element_shift_leakyrelu_store a2, a8, a10, a15 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_leakyrelu_n_remainder_loop - - tie728_s16_unaligned_conv2d_11cn_leakyrelu_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_11cn_prelu - .type dl_tie728_s16_unaligned_conv2d_11cn_prelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_11cn_prelu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args, 15 - input_channel % (vector_width / element_width) * sizeof(feature_t) - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 64 # a8: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a9, a4, 96 # a9: n_div_x = output_channel / (vector_width / element_width) - l32i a10, a4, 80 # a10: activation_alpha_ptr - l32i a11, a4, 84 # a11: activation_shift - - blti a9, 1, tie728_s16_unaligned_conv2d_11cn_prelu_n_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_11cn_prelu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_11cn_prelu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_11cn_prelu_64b - tie728_s16_unaligned_conv2d_11cn_prelu_32b: - tie728_s16_unaligned_conv2d_11cn_prelu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_32b_aligned_vector_shift_prelu_store q0, a2, a8, q1, a10, a11, a15 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_prelu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_11cn_prelu_n_remainder - - tie728_s16_unaligned_conv2d_11cn_prelu_64b: - tie728_s16_unaligned_conv2d_11cn_prelu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_64b_aligned_vector_shift_prelu_store q0, a2, a8, q1, a10, a11 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_prelu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_11cn_prelu_n_remainder - - tie728_s16_unaligned_conv2d_11cn_prelu_128b: - tie728_s16_unaligned_conv2d_11cn_prelu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_128b_aligned_vector_shift_prelu_store q0, a2, a8, q1, a10, a11 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_prelu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_11cn_prelu_n_remainder: - l32i a9, a4, 140 # a9: n_remainder - beqz a9, tie728_s16_unaligned_conv2d_11cn_prelu_n_remainder_end - - ssr a11 # ssr: activation_shift - movi a11, 15 - sub a11, a11, a7 # a11: 15 - c_remainder - movi a12, 0 # a12: zero - - tie728_s16_unaligned_conv2d_11cn_prelu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - tie728_s16_unaligned_conv2d_11c1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a11, a12 - tie728_s16_element_shift_prelu_store a2, a8, a10, a14, a15 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_prelu_n_remainder_loop - - tie728_s16_unaligned_conv2d_11cn_prelu_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_11cn_bias - .type dl_tie728_s16_unaligned_conv2d_11cn_bias, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_11cn_bias: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args, 15 - input_channel % (vector_width / element_width) * sizeof(feature_t) - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 64 # a8: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a9, a4, 96 # a9: n_div_x = output_channel / (vector_width / element_width) - l32i a10, a4, 68 # a10: bias_ptr - - blti a9, 1, tie728_s16_unaligned_conv2d_11cn_bias_n_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_11cn_bias_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_11cn_bias_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_11cn_bias_64b - tie728_s16_unaligned_conv2d_11cn_bias_32b: - tie728_s16_unaligned_conv2d_11cn_bias_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_32b_aligned_vector_shift_bias_store q0, a2, a8, q1, a10, a15 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_bias_32b_multiple_loop - j tie728_s16_unaligned_conv2d_11cn_bias_n_remainder - - tie728_s16_unaligned_conv2d_11cn_bias_64b: - tie728_s16_unaligned_conv2d_11cn_bias_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_64b_aligned_vector_shift_bias_store q0, a2, a8, q1, a10 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_bias_64b_multiple_loop - j tie728_s16_unaligned_conv2d_11cn_bias_n_remainder - - tie728_s16_unaligned_conv2d_11cn_bias_128b: - tie728_s16_unaligned_conv2d_11cn_bias_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_128b_aligned_vector_shift_bias_store q0, a2, a8, q1, a10 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_bias_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_11cn_bias_n_remainder: - l32i a9, a4, 140 # a9: n_remainder - beqz a9, tie728_s16_unaligned_conv2d_11cn_bias_n_remainder_end - - movi a11, 15 - sub a11, a11, a7 # a11: 15 - c_remainder - movi a12, 0 # a12: zero - - tie728_s16_unaligned_conv2d_11cn_bias_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - tie728_s16_unaligned_conv2d_11c1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a11, a12 - tie728_s16_element_shift_bias_store a2, a8, a10, a14, a15 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_bias_n_remainder_loop - - tie728_s16_unaligned_conv2d_11cn_bias_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_11cn_bias_relu - .type dl_tie728_s16_unaligned_conv2d_11cn_bias_relu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_11cn_bias_relu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args, 15 - input_channel % (vector_width / element_width) * sizeof(feature_t) - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 64 # a8: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a9, a4, 96 # a9: n_div_x = output_channel / (vector_width / element_width) - l32i a10, a4, 68 # a10: bias_ptr - movi a11, 0 # a11: activation_alpha = zero - movi a12, 0 # a12: activation_shift = zero - - blti a9, 1, tie728_s16_unaligned_conv2d_11cn_bias_relu_n_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_11cn_bias_relu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_11cn_bias_relu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_11cn_bias_relu_64b - tie728_s16_unaligned_conv2d_11cn_bias_relu_32b: - tie728_s16_unaligned_conv2d_11cn_bias_relu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_32b_aligned_vector_shift_bias_relu_store q0, a2, a8, q1, a10, a11, a12, a15 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_bias_relu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_11cn_bias_relu_n_remainder - - tie728_s16_unaligned_conv2d_11cn_bias_relu_64b: - tie728_s16_unaligned_conv2d_11cn_bias_relu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_64b_aligned_vector_shift_bias_relu_store q0, a2, a8, q1, a10, a11, a12 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_bias_relu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_11cn_bias_relu_n_remainder - - tie728_s16_unaligned_conv2d_11cn_bias_relu_128b: - tie728_s16_unaligned_conv2d_11cn_bias_relu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_128b_aligned_vector_shift_bias_relu_store q0, a2, a8, q1, a10, a11, a12 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_bias_relu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_11cn_bias_relu_n_remainder: - l32i a9, a4, 140 # a9: n_remainder - beqz a9, tie728_s16_unaligned_conv2d_11cn_bias_relu_n_remainder_end - - movi a11, 15 - sub a11, a11, a7 # a11: 15 - c_remainder - - tie728_s16_unaligned_conv2d_11cn_bias_relu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - tie728_s16_unaligned_conv2d_11c1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a11, a12 - tie728_s16_element_shift_bias_relu_store a2, a8, a10, a13, a15 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_bias_relu_n_remainder_loop - - tie728_s16_unaligned_conv2d_11cn_bias_relu_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu - .type dl_tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args, 15 - input_channel % (vector_width / element_width) * sizeof(feature_t) - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 64 # a8: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a9, a4, 96 # a9: n_div_x = output_channel / (vector_width / element_width) - l32i a10, a4, 68 # a10: bias_ptr - l32i a11, a4, 76 # a11: activation_alpha - l32i a12, a4, 84 # a12: activation_shift - - blti a9, 1, tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_n_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_64b - tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_32b: - tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_32b_aligned_vector_shift_bias_relu_store q0, a2, a8, q1, a10, a11, a12, a15 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_n_remainder - - tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_64b: - tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_64b_aligned_vector_shift_bias_relu_store q0, a2, a8, q1, a10, a11, a12 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_n_remainder - - tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_128b: - tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_128b_aligned_vector_shift_bias_relu_store q0, a2, a8, q1, a10, a11, a12 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_n_remainder: - l32i a9, a4, 140 # a9: n_remainder - beqz a9, tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_n_remainder_end - - ssr a12 # ssr: activation_shift - movi a12, 15 - sub a12, a12, a7 # a12: 15 - c_remainder - movi a13, 0 # a13: zero - - tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - tie728_s16_unaligned_conv2d_11c1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a12, a13 - tie728_s16_element_shift_bias_leakyrelu_store a2, a8, a10, a11, a14, a15 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_n_remainder_loop - - tie728_s16_unaligned_conv2d_11cn_bias_leakyrelu_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_11cn_bias_prelu - .type dl_tie728_s16_unaligned_conv2d_11cn_bias_prelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_11cn_bias_prelu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args, 15 - input_channel % (vector_width / element_width) * sizeof(feature_t) - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 64 # a8: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a9, a4, 96 # a9: n_div_x = output_channel / (vector_width / element_width) - l32i a10, a4, 68 # a10: bias_ptr - l32i a11, a4, 80 # a11: activation_alpha_ptr - l32i a12, a4, 84 # a12: activation_shift - - blti a9, 1, tie728_s16_unaligned_conv2d_11cn_bias_prelu_n_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_11cn_bias_prelu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_11cn_bias_prelu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_11cn_bias_prelu_64b - tie728_s16_unaligned_conv2d_11cn_bias_prelu_32b: - tie728_s16_unaligned_conv2d_11cn_bias_prelu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_32b_aligned_vector_shift_bias_prelu_store q0, a2, a8, q1, a10, q2, a11, a12, a15 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_bias_prelu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_11cn_bias_prelu_n_remainder - - tie728_s16_unaligned_conv2d_11cn_bias_prelu_64b: - tie728_s16_unaligned_conv2d_11cn_bias_prelu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_64b_aligned_vector_shift_bias_prelu_store q0, a2, a8, q1, a10, q2, a11, a12 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_bias_prelu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_11cn_bias_prelu_n_remainder - - tie728_s16_unaligned_conv2d_11cn_bias_prelu_128b: - tie728_s16_unaligned_conv2d_11cn_bias_prelu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_11c8 q0, q1, q2, q3, q4, a15, a5, a6, a7 - tie728_s16_128b_aligned_vector_shift_bias_prelu_store q0, a2, a8, q1, a10, q2, a11, a12 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_bias_prelu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_11cn_bias_prelu_n_remainder: - l32i a9, a4, 140 # a9: n_remainder - beqz a9, tie728_s16_unaligned_conv2d_11cn_bias_prelu_n_remainder_end - - ssr a12 # ssr: activation_shift - movi a12, 15 - sub a12, a12, a7 # a12: 15 - c_remainder - movi a13, 0 # a13: zero - - tie728_s16_unaligned_conv2d_11cn_bias_prelu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - tie728_s16_unaligned_conv2d_11c1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a12, a13 - tie728_s16_element_shift_bias_prelu_store a2, a8, a10, a11, a14, a15 - - addi a9, a9, -1 - bnez a9, tie728_s16_unaligned_conv2d_11cn_bias_prelu_n_remainder_loop - - tie728_s16_unaligned_conv2d_11cn_bias_prelu_n_remainder_end: - - retw - - - - - - - - - - - - - - -############################################################################################################################################################ -#### -#### tie728_s16_unaligned_conv2d_33cn series -#### -############################################################################################################################################################ -.macro tie728_s16_unaligned_conv2d_33c8 input_v0, input_front, input_back, filter_v0, filter_v1, input_ptr, filter_ptr, c_div_x_1, c_remainder, dilation_x_offset, dilation_y_offset - tie728_s16_unaligned_conv2d_11c8 \input_v0, \input_front, \input_back, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_unaligned_conv2d_11c8 \input_v0, \input_front, \input_back, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_unaligned_conv2d_11c8 \input_v0, \input_front, \input_back, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder - add \input_ptr, \input_ptr, \dilation_y_offset - - tie728_s16_unaligned_conv2d_11c8 \input_v0, \input_front, \input_back, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_unaligned_conv2d_11c8 \input_v0, \input_front, \input_back, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_unaligned_conv2d_11c8 \input_v0, \input_front, \input_back, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder - add \input_ptr, \input_ptr, \dilation_y_offset - - tie728_s16_unaligned_conv2d_11c8 \input_v0, \input_front, \input_back, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_unaligned_conv2d_11c8 \input_v0, \input_front, \input_back, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_unaligned_conv2d_11c8 \input_v0, \input_front, \input_back, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder - # add \input_ptr, \input_ptr, \dilation_y_offset -.endm - -.macro tie728_s16_unaligned_conv2d_33c1 input_v0, input_front, input_back, filter_v0, filter_front, filter_back, input_ptr, filter_ptr, c_div_x_1, c_remainder, dilation_x_offset, dilation_y_offset, temp, zero - tie728_s16_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \temp, \zero - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \temp, \zero - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \temp, \zero - add \input_ptr, \input_ptr, \dilation_y_offset - - tie728_s16_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \temp, \zero - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \temp, \zero - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \temp, \zero - add \input_ptr, \input_ptr, \dilation_y_offset - - tie728_s16_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \temp, \zero - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \temp, \zero - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s16_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \temp, \zero - # add \input_ptr, \input_ptr, \dilation_y_offset -.endm - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_33cn - .type dl_tie728_s16_unaligned_conv2d_33cn, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_33cn: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args, 15 - input_channel % (vector_width / element_width) * sizeof(feature_t) - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 108 # a8: dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(feature_t) - l32i a9, a4, 112 # a9: dilation_y_offset - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 96 # a11: n_div_x = output_channel / (vector_width / element_width) - - blti a11, 1, tie728_s16_unaligned_conv2d_33cn_n_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_33cn_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_33cn_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_33cn_64b - tie728_s16_unaligned_conv2d_33cn_32b: - tie728_s16_unaligned_conv2d_33cn_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_32b_aligned_vector_shift_store q0, a2, a10, a15 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_32b_multiple_loop - j tie728_s16_unaligned_conv2d_33cn_n_remainder - - tie728_s16_unaligned_conv2d_33cn_64b: - tie728_s16_unaligned_conv2d_33cn_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_64b_aligned_vector_shift_store q0, a2, a10 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_64b_multiple_loop - j tie728_s16_unaligned_conv2d_33cn_n_remainder - - tie728_s16_unaligned_conv2d_33cn_128b: - tie728_s16_unaligned_conv2d_33cn_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_128b_aligned_vector_shift_store q0, a2, a10 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_33cn_n_remainder: - l32i a11, a4, 140 # a11: n_remainder - beqz a11, tie728_s16_unaligned_conv2d_33cn_n_remainder_end - - movi a12, 15 - sub a12, a12, a7 # a12: 15 - c_remainder - movi a13, 0 # a13: activation_shift = zero - - tie728_s16_unaligned_conv2d_33cn_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - tie728_s16_unaligned_conv2d_33c1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a8, a9, a12, a13 - tie728_s16_element_shift_store a2, a10, a15 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_n_remainder_loop - - tie728_s16_unaligned_conv2d_33cn_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_33cn_relu - .type dl_tie728_s16_unaligned_conv2d_33cn_relu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_33cn_relu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args, 15 - input_channel % (vector_width / element_width) * sizeof(feature_t) - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 108 # a8: dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(feature_t) - l32i a9, a4, 112 # a9: dilation_y_offset - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 96 # a11: n_div_x = output_channel / (vector_width / element_width) - movi a12, 0 # a12: activation_alpha = zero - movi a13, 0 # a13: activation_shift = zero - - blti a11, 1, tie728_s16_unaligned_conv2d_33cn_relu_n_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_33cn_relu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_33cn_relu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_33cn_relu_64b - tie728_s16_unaligned_conv2d_33cn_relu_32b: - tie728_s16_unaligned_conv2d_33cn_relu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_32b_aligned_vector_shift_relu_store q0, a2, a10, a12, a13, a15 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_relu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_33cn_relu_n_remainder - - tie728_s16_unaligned_conv2d_33cn_relu_64b: - tie728_s16_unaligned_conv2d_33cn_relu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_64b_aligned_vector_shift_relu_store q0, a2, a10, a12, a13 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_relu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_33cn_relu_n_remainder - - tie728_s16_unaligned_conv2d_33cn_relu_128b: - tie728_s16_unaligned_conv2d_33cn_relu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_128b_aligned_vector_shift_relu_store q0, a2, a10, a12, a13 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_relu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_33cn_relu_n_remainder: - l32i a11, a4, 140 # a11: n_remainder - beqz a11, tie728_s16_unaligned_conv2d_33cn_relu_n_remainder_end - - movi a12, 15 - sub a12, a12, a7 # a12: 15 - c_remainder - - tie728_s16_unaligned_conv2d_33cn_relu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - tie728_s16_unaligned_conv2d_33c1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a8, a9, a12, a13 - tie728_s16_element_shift_relu_store a2, a10, a15 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_relu_n_remainder_loop - - tie728_s16_unaligned_conv2d_33cn_relu_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_33cn_leakyrelu - .type dl_tie728_s16_unaligned_conv2d_33cn_leakyrelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_33cn_leakyrelu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args, 15 - input_channel % (vector_width / element_width) * sizeof(feature_t) - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 108 # a8: dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(feature_t) - l32i a9, a4, 112 # a9: dilation_y_offset - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 96 # a11: n_div_x = output_channel / (vector_width / element_width) - l32i a12, a4, 76 # a12: activation_alpha - l32i a13, a4, 84 # a13: activation_shift - - blti a11, 1, tie728_s16_unaligned_conv2d_33cn_leakyrelu_n_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_33cn_leakyrelu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_33cn_leakyrelu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_33cn_leakyrelu_64b - tie728_s16_unaligned_conv2d_33cn_leakyrelu_32b: - tie728_s16_unaligned_conv2d_33cn_leakyrelu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_32b_aligned_vector_shift_relu_store q0, a2, a10, a12, a13, a15 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_leakyrelu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_33cn_leakyrelu_n_remainder - - tie728_s16_unaligned_conv2d_33cn_leakyrelu_64b: - tie728_s16_unaligned_conv2d_33cn_leakyrelu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_64b_aligned_vector_shift_relu_store q0, a2, a10, a12, a13 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_leakyrelu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_33cn_leakyrelu_n_remainder - - tie728_s16_unaligned_conv2d_33cn_leakyrelu_128b: - tie728_s16_unaligned_conv2d_33cn_leakyrelu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_128b_aligned_vector_shift_relu_store q0, a2, a10, a12, a13 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_leakyrelu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_33cn_leakyrelu_n_remainder: - l32i a11, a4, 140 # a11: n_remainder - beqz a11, tie728_s16_unaligned_conv2d_33cn_leakyrelu_n_remainder_end - - ssr a13 # ssr: activation_shift - movi a13, 15 - sub a13, a13, a7 # a13: 15 - c_remainder - movi a14, 0 # a14: zero - - tie728_s16_unaligned_conv2d_33cn_leakyrelu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - tie728_s16_unaligned_conv2d_33c1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a8, a9, a13, a14 - tie728_s16_element_shift_leakyrelu_store a2, a10, a12, a15 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_leakyrelu_n_remainder_loop - - tie728_s16_unaligned_conv2d_33cn_leakyrelu_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_33cn_prelu - .type dl_tie728_s16_unaligned_conv2d_33cn_prelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_33cn_prelu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args, 15 - input_channel % (vector_width / element_width) * sizeof(feature_t) - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 108 # a8: dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(feature_t) - l32i a9, a4, 112 # a9: dilation_y_offset - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 96 # a11: n_div_x = output_channel / (vector_width / element_width) - l32i a12, a4, 80 # a12: activation_alpha_ptr - l32i a13, a4, 84 # a13: activation_shift - - blti a11, 1, tie728_s16_unaligned_conv2d_33cn_prelu_n_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_33cn_prelu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_33cn_prelu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_33cn_prelu_64b - tie728_s16_unaligned_conv2d_33cn_prelu_32b: - tie728_s16_unaligned_conv2d_33cn_prelu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_32b_aligned_vector_shift_prelu_store q0, a2, a10, q1, a12, a13, a15 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_prelu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_33cn_prelu_n_remainder - - tie728_s16_unaligned_conv2d_33cn_prelu_64b: - tie728_s16_unaligned_conv2d_33cn_prelu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_64b_aligned_vector_shift_prelu_store q0, a2, a10, q1, a12, a13 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_prelu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_33cn_prelu_n_remainder - - tie728_s16_unaligned_conv2d_33cn_prelu_128b: - tie728_s16_unaligned_conv2d_33cn_prelu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_128b_aligned_vector_shift_prelu_store q0, a2, a10, q1, a12, a13 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_prelu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_33cn_prelu_n_remainder: - l32i a11, a4, 140 # a11: n_remainder - beqz a11, tie728_s16_unaligned_conv2d_33cn_prelu_n_remainder_end - - ssr a13 # ssr: activation_shift - movi a13, 15 - sub a13, a13, a7 # a13: 15 - c_remainder - - tie728_s16_unaligned_conv2d_33cn_prelu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - movi a14, 0 # a14: zero - tie728_s16_unaligned_conv2d_33c1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a8, a9, a13, a14 - tie728_s16_element_shift_prelu_store a2, a10, a12, a14, a15 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_prelu_n_remainder_loop - - tie728_s16_unaligned_conv2d_33cn_prelu_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_33cn_bias - .type dl_tie728_s16_unaligned_conv2d_33cn_bias, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_33cn_bias: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args, 15 - input_channel % (vector_width / element_width) * sizeof(feature_t) - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 108 # a8: dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(feature_t) - l32i a9, a4, 112 # a9: dilation_y_offset - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 96 # a11: n_div_x = output_channel / (vector_width / element_width) - l32i a12, a4, 68 # a12: bias_ptr - - blti a11, 1, tie728_s16_unaligned_conv2d_33cn_bias_n_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_33cn_bias_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_33cn_bias_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_33cn_bias_64b - tie728_s16_unaligned_conv2d_33cn_bias_32b: - tie728_s16_unaligned_conv2d_33cn_bias_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_32b_aligned_vector_shift_bias_store q0, a2, a10, q1, a12, a15 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_bias_32b_multiple_loop - j tie728_s16_unaligned_conv2d_33cn_bias_n_remainder - - tie728_s16_unaligned_conv2d_33cn_bias_64b: - tie728_s16_unaligned_conv2d_33cn_bias_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_64b_aligned_vector_shift_bias_store q0, a2, a10, q1, a12 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_bias_64b_multiple_loop - j tie728_s16_unaligned_conv2d_33cn_bias_n_remainder - - tie728_s16_unaligned_conv2d_33cn_bias_128b: - tie728_s16_unaligned_conv2d_33cn_bias_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_128b_aligned_vector_shift_bias_store q0, a2, a10, q1, a12 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_bias_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_33cn_bias_n_remainder: - l32i a11, a4, 140 # a11: n_remainder - beqz a11, tie728_s16_unaligned_conv2d_33cn_bias_n_remainder_end - - movi a13, 15 - sub a13, a13, a7 # a13: 15 - c_remainder - - tie728_s16_unaligned_conv2d_33cn_bias_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - movi a14, 0 # a14: zero - tie728_s16_unaligned_conv2d_33c1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a8, a9, a13, a14 - tie728_s16_element_shift_bias_store a2, a10, a12, a14, a15 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_bias_n_remainder_loop - - tie728_s16_unaligned_conv2d_33cn_bias_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_33cn_bias_relu - .type dl_tie728_s16_unaligned_conv2d_33cn_bias_relu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_33cn_bias_relu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args, 15 - input_channel % (vector_width / element_width) * sizeof(feature_t) - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 108 # a8: dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(feature_t) - l32i a9, a4, 112 # a9: dilation_y_offset - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 96 # a11: n_div_x = output_channel / (vector_width / element_width) - l32i a12, a4, 68 # a12: bias_ptr - movi a13, 0 # a13: activation_alpha = zero - movi a14, 0 # a14: activation_shift = zero - - blti a11, 1, tie728_s16_unaligned_conv2d_33cn_bias_relu_n_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_33cn_bias_relu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_33cn_bias_relu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_33cn_bias_relu_64b - tie728_s16_unaligned_conv2d_33cn_bias_relu_32b: - tie728_s16_unaligned_conv2d_33cn_bias_relu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_32b_aligned_vector_shift_bias_relu_store q0, a2, a10, q1, a12, a13, a14, a15 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_bias_relu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_33cn_bias_relu_n_remainder - - tie728_s16_unaligned_conv2d_33cn_bias_relu_64b: - tie728_s16_unaligned_conv2d_33cn_bias_relu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_64b_aligned_vector_shift_bias_relu_store q0, a2, a10, q1, a12, a13, a14 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_bias_relu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_33cn_bias_relu_n_remainder - - tie728_s16_unaligned_conv2d_33cn_bias_relu_128b: - tie728_s16_unaligned_conv2d_33cn_bias_relu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_128b_aligned_vector_shift_bias_relu_store q0, a2, a10, q1, a12, a13, a14 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_bias_relu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_33cn_bias_relu_n_remainder: - l32i a11, a4, 140 # a11: n_remainder - beqz a11, tie728_s16_unaligned_conv2d_33cn_bias_relu_n_remainder_end - - movi a13, 15 - sub a13, a13, a7 # a13: 15 - c_remainder - - tie728_s16_unaligned_conv2d_33cn_bias_relu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - movi a14, 0 # a14: zero - tie728_s16_unaligned_conv2d_33c1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a8, a9, a13, a14 - tie728_s16_element_shift_bias_relu_store a2, a10, a12, a14, a15 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_bias_relu_n_remainder_loop - - tie728_s16_unaligned_conv2d_33cn_bias_relu_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu - .type dl_tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args, 15 - input_channel % (vector_width / element_width) * sizeof(feature_t) - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 108 # a8: dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(feature_t) - l32i a9, a4, 112 # a9: dilation_y_offset - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 96 # a11: n_div_x = output_channel / (vector_width / element_width) - l32i a12, a4, 68 # a12: bias_ptr - l32i a13, a4, 76 # a13: activation_alpha - l32i a14, a4, 84 # a14: activation_shift - - blti a11, 1, tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_n_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_64b - tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_32b: - tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_32b_aligned_vector_shift_bias_relu_store q0, a2, a10, q1, a12, a13, a14, a15 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_n_remainder - - tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_64b: - tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_64b_aligned_vector_shift_bias_relu_store q0, a2, a10, q1, a12, a13, a14 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_n_remainder - - tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_128b: - tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_128b_aligned_vector_shift_bias_relu_store q0, a2, a10, q1, a12, a13, a14 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_n_remainder: - l32i a11, a4, 140 # a11: n_remainder - beqz a11, tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_n_remainder_end - - movi a10, 15 - sub a10, a10, a7 # a10: 15 - c_remainder - ssr a14 # ssr: activation_shift - - tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - l32i a9, a4, 112 # a9: dilation_y_offset - movi a14, 0 # a14: zero - tie728_s16_unaligned_conv2d_33c1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a8, a9, a10, a14 - - l32i a9, a4, 64 # a9: mac_shift = output.exponent - filter.exponent - input.exponent - tie728_s16_element_shift_bias_leakyrelu_store a2, a9, a12, a13, a14, a15 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_n_remainder_loop - - tie728_s16_unaligned_conv2d_33cn_bias_leakyrelu_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_33cn_bias_prelu - .type dl_tie728_s16_unaligned_conv2d_33cn_bias_prelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_33cn_bias_prelu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args, 15 - input_channel % (vector_width / element_width) * sizeof(feature_t) - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 108 # a8: dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(feature_t) - l32i a9, a4, 112 # a9: dilation_y_offset - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 96 # a11: n_div_x = output_channel / (vector_width / element_width) - l32i a12, a4, 68 # a12: bias_ptr - l32i a13, a4, 80 # a13: activation_alpha_ptr - l32i a14, a4, 84 # a14: activation_shift - - blti a11, 1, tie728_s16_unaligned_conv2d_33cn_bias_prelu_n_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_33cn_bias_prelu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_33cn_bias_prelu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_33cn_bias_prelu_64b - tie728_s16_unaligned_conv2d_33cn_bias_prelu_32b: - tie728_s16_unaligned_conv2d_33cn_bias_prelu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_32b_aligned_vector_shift_bias_prelu_store q0, a2, a10, q1, a12, q2, a13, a14, a15 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_bias_prelu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_33cn_bias_prelu_n_remainder - - tie728_s16_unaligned_conv2d_33cn_bias_prelu_64b: - tie728_s16_unaligned_conv2d_33cn_bias_prelu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_64b_aligned_vector_shift_bias_prelu_store q0, a2, a10, q1, a12, q2, a13, a14 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_bias_prelu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_33cn_bias_prelu_n_remainder - - tie728_s16_unaligned_conv2d_33cn_bias_prelu_128b: - tie728_s16_unaligned_conv2d_33cn_bias_prelu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_33c8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9 - tie728_s16_128b_aligned_vector_shift_bias_prelu_store q0, a2, a10, q1, a12, q2, a13, a14 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_bias_prelu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_33cn_bias_prelu_n_remainder: - l32i a11, a4, 140 # a11: n_remainder - beqz a11, tie728_s16_unaligned_conv2d_33cn_bias_prelu_n_remainder_end - - movi a10, 15 - sub a10, a10, a7 # a10: 15 - c_remainder - ssr a14 # ssr: activation_shift - - tie728_s16_unaligned_conv2d_33cn_bias_prelu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - l32i a9, a4, 112 # a9: dilation_y_offset - movi a14, 0 # a14: zero - tie728_s16_unaligned_conv2d_33c1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a8, a9, a10, a14 - - l32i a9, a4, 64 # a9: mac_shift = output.exponent - filter.exponent - input.exponent - tie728_s16_element_shift_bias_prelu_store a2, a9, a12, a13, a14, a15 - - addi a11, a11, -1 - bnez a11, tie728_s16_unaligned_conv2d_33cn_bias_prelu_n_remainder_loop - - tie728_s16_unaligned_conv2d_33cn_bias_prelu_n_remainder_end: - - retw - - - - - - - - - -############################################################################################################################################################ -#### -#### tie728_s16_unaligned_conv2d_hwcn series -#### -############################################################################################################################################################ -.macro tie728_s16_unaligned_conv2d_hwc8 input_v0, input_front, input_back, filter_v0, filter_v1, input_ptr, filter_ptr, c_div_x_1, c_remainder, dilation_x_offset, dilation_y_offset, filter_h, filter_w, args, filter_offset_q - l32i \filter_h, \args, 52 # filter_height - 10: - l32i \filter_w, \args, 56 # filter_width - beqi \filter_w, 1, 11f - 9: - tie728_s16_unaligned_conv2d_11c8 \input_v0, \input_front, \input_back, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder - add \input_ptr, \input_ptr, \dilation_x_offset - - addi \filter_w, \filter_w, -1 - bgei \filter_w, 2, 9b - 11: - tie728_s16_unaligned_conv2d_11c8 \input_v0, \input_front, \input_back, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder - - EE.MOVI.32.A \filter_offset_q, \filter_w, 1 - add \filter_ptr, \filter_ptr, \filter_w - add \input_ptr, \input_ptr, \dilation_y_offset - - addi \filter_h, \filter_h, -1 - bnez \filter_h, 10b - - EE.MOVI.32.A \filter_offset_q, \filter_h, 2 - add \filter_ptr, \filter_ptr, \filter_h -.endm - -.macro tie728_s16_unaligned_conv2d_hwc1 input_v0, input_front, input_back, filter_v0, filter_front, filter_back, input_ptr, filter_ptr, c_div_x_1, c_remainder, dilation_x_offset, dilation_y_offset, filter_h, filter_w, args, temp, zero, filter_offset_q - l32i \filter_h, \args, 52 # filter_height - 10: - l32i \filter_w, \args, 56 # filter_width - beqi \filter_w, 1, 11f - 9: - tie728_s16_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \temp, \zero - add \input_ptr, \input_ptr, \dilation_x_offset - - addi \filter_w, \filter_w, -1 - bgei \filter_w, 2, 9b - 11: - tie728_s16_unaligned_conv2d_11c1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_div_x_1, \c_remainder, \temp, \zero - - EE.MOVI.32.A \filter_offset_q, \filter_w, 1 - add \filter_ptr, \filter_ptr, \filter_w - add \input_ptr, \input_ptr, \dilation_y_offset - - addi \filter_h, \filter_h, -1 - bnez \filter_h, 10b - - EE.MOVI.32.A \filter_offset_q, \filter_h, 2 - add \filter_ptr, \filter_ptr, \filter_h -.endm - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_hwcn - .type dl_tie728_s16_unaligned_conv2d_hwcn, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_hwcn: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 108 # a8: dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(feature_t) - l32i a9, a4, 112 # a9: dilation_y_offset - l32i a12, a4, 96 # a12: n_div_x = output_channel / (vector_width / element_width) - l32i a13, a4, 64 # a13: mac_shift = output.exponent - filter.exponent - input.exponent - - blti a12, 1, tie728_s16_unaligned_conv2d_hwcn_n_remainder - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_hwcn_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_hwcn_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_hwcn_64b - tie728_s16_unaligned_conv2d_hwcn_32b: - tie728_s16_unaligned_conv2d_hwcn_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - tie728_s16_32b_aligned_vector_shift_store q0, a2, a13, a15 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_32b_multiple_loop - j tie728_s16_unaligned_conv2d_hwcn_n_remainder - - tie728_s16_unaligned_conv2d_hwcn_64b: - tie728_s16_unaligned_conv2d_hwcn_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - tie728_s16_64b_aligned_vector_shift_store q0, a2, a13 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_64b_multiple_loop - j tie728_s16_unaligned_conv2d_hwcn_n_remainder - - tie728_s16_unaligned_conv2d_hwcn_128b: - tie728_s16_unaligned_conv2d_hwcn_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - tie728_s16_128b_aligned_vector_shift_store q0, a2, a13 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_hwcn_n_remainder: - l32i a12, a4, 140 # a12: n_remainder - beqz a12, tie728_s16_unaligned_conv2d_hwcn_n_remainder_end - - l32i a5, a4, 160 - l32i a15, a4, 164 - EE.MOVI.32.Q q7, a5, 1 - EE.MOVI.32.Q q7, a15, 2 - l32i a5, a4, 168 # filter_ptr unaligned - - movi a13, 15 - sub a13, a13, a7 # a13: 15 - c_remainder - movi a14, 0 # a14: zero - - tie728_s16_unaligned_conv2d_hwcn_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - tie728_s16_unaligned_conv2d_hwc1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a8, a9, a10, a11, a4, a13, a14, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - tie728_s16_element_shift_store a2, a10, a15 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_n_remainder_loop - - tie728_s16_unaligned_conv2d_hwcn_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_hwcn_relu - .type dl_tie728_s16_unaligned_conv2d_hwcn_relu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_hwcn_relu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 108 # a8: dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(feature_t) - l32i a9, a4, 112 # a9: dilation_y_offset - l32i a12, a4, 96 # a12: n_div_x = output_channel / (vector_width / element_width) - movi a13, 0 # a13: zero - movi a14, 0 # a14: zero - - blti a12, 1, tie728_s16_unaligned_conv2d_hwcn_relu_n_remainder - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_hwcn_relu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_hwcn_relu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_hwcn_relu_64b - tie728_s16_unaligned_conv2d_hwcn_relu_32b: - tie728_s16_unaligned_conv2d_hwcn_relu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - tie728_s16_32b_aligned_vector_shift_relu_store q0, a2, a10, a13, a14, a15 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_relu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_hwcn_relu_n_remainder - - tie728_s16_unaligned_conv2d_hwcn_relu_64b: - tie728_s16_unaligned_conv2d_hwcn_relu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - tie728_s16_64b_aligned_vector_shift_relu_store q0, a2, a10, a13, a14 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_relu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_hwcn_relu_n_remainder - - tie728_s16_unaligned_conv2d_hwcn_relu_128b: - tie728_s16_unaligned_conv2d_hwcn_relu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - tie728_s16_128b_aligned_vector_shift_relu_store q0, a2, a10, a13, a14 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_relu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_hwcn_relu_n_remainder: - l32i a12, a4, 140 # a12: n_remainder - beqz a12, tie728_s16_unaligned_conv2d_hwcn_relu_n_remainder_end - - l32i a5, a4, 160 - l32i a15, a4, 164 - EE.MOVI.32.Q q7, a5, 1 - EE.MOVI.32.Q q7, a15, 2 - l32i a5, a4, 168 # filter_ptr unaligned - - movi a13, 15 - sub a13, a13, a7 # a13: 15 - c_remainder - - tie728_s16_unaligned_conv2d_hwcn_relu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - tie728_s16_unaligned_conv2d_hwc1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a8, a9, a10, a11, a4, a13, a14, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - tie728_s16_element_shift_relu_store a2, a10, a15 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_relu_n_remainder_loop - - tie728_s16_unaligned_conv2d_hwcn_relu_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_hwcn_leakyrelu - .type dl_tie728_s16_unaligned_conv2d_hwcn_leakyrelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_hwcn_leakyrelu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 108 # a8: dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(feature_t) - l32i a9, a4, 112 # a9: dilation_y_offset - l32i a12, a4, 96 # a12: n_div_x = output_channel / (vector_width / element_width) - l32i a13, a4, 76 # a13: activation_alpha - l32i a14, a4, 84 # a14: activation_shift - - blti a12, 1, tie728_s16_unaligned_conv2d_hwcn_leakyrelu_n_remainder - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_hwcn_leakyrelu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_hwcn_leakyrelu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_hwcn_leakyrelu_64b - tie728_s16_unaligned_conv2d_hwcn_leakyrelu_32b: - tie728_s16_unaligned_conv2d_hwcn_leakyrelu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - tie728_s16_32b_aligned_vector_shift_relu_store q0, a2, a10, a13, a14, a15 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_leakyrelu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_hwcn_leakyrelu_n_remainder - - tie728_s16_unaligned_conv2d_hwcn_leakyrelu_64b: - tie728_s16_unaligned_conv2d_hwcn_leakyrelu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - tie728_s16_64b_aligned_vector_shift_relu_store q0, a2, a10, a13, a14 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_leakyrelu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_hwcn_leakyrelu_n_remainder - - tie728_s16_unaligned_conv2d_hwcn_leakyrelu_128b: - tie728_s16_unaligned_conv2d_hwcn_leakyrelu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - tie728_s16_128b_aligned_vector_shift_relu_store q0, a2, a10, a13, a14 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_leakyrelu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_hwcn_leakyrelu_n_remainder: - l32i a12, a4, 140 # a12: n_remainder - beqz a12, tie728_s16_unaligned_conv2d_hwcn_leakyrelu_n_remainder_end - - l32i a5, a4, 160 - l32i a15, a4, 164 - EE.MOVI.32.Q q7, a5, 1 - EE.MOVI.32.Q q7, a15, 2 - l32i a5, a4, 168 # filter_ptr unaligned - - EE.MOVI.32.Q q6, a13, 0 # q6[0]: activation_alpha - movi a13, 15 - sub a13, a13, a7 # a13: 15 - c_remainder - ssr a14 # ssr: activation_shift - movi a14, 0 # a14: zero - - tie728_s16_unaligned_conv2d_hwcn_leakyrelu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - tie728_s16_unaligned_conv2d_hwc1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a8, a9, a10, a11, a4, a13, a14, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - EE.MOVI.32.A q6, a11, 0 # a11: activation_alpha - tie728_s16_element_shift_leakyrelu_store a2, a10, a11, a15 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_leakyrelu_n_remainder_loop - - tie728_s16_unaligned_conv2d_hwcn_leakyrelu_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_hwcn_prelu - .type dl_tie728_s16_unaligned_conv2d_hwcn_prelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_hwcn_prelu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 108 # a8: dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(feature_t) - l32i a9, a4, 112 # a9: dilation_y_offset - l32i a12, a4, 96 # a12: n_div_x = output_channel / (vector_width / element_width) - l32i a13, a4, 80 # a13: activation_alpha_ptr - l32i a14, a4, 84 # a14: activation_shift - - blti a12, 1, tie728_s16_unaligned_conv2d_hwcn_prelu_n_remainder - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_hwcn_prelu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_hwcn_prelu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_hwcn_prelu_64b - tie728_s16_unaligned_conv2d_hwcn_prelu_32b: - tie728_s16_unaligned_conv2d_hwcn_prelu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - tie728_s16_32b_aligned_vector_shift_prelu_store q0, a2, a10, q1, a13, a14, a15 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_prelu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_hwcn_prelu_n_remainder - - tie728_s16_unaligned_conv2d_hwcn_prelu_64b: - tie728_s16_unaligned_conv2d_hwcn_prelu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - tie728_s16_64b_aligned_vector_shift_prelu_store q0, a2, a10, q1, a13, a14 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_prelu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_hwcn_prelu_n_remainder - - tie728_s16_unaligned_conv2d_hwcn_prelu_128b: - tie728_s16_unaligned_conv2d_hwcn_prelu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - tie728_s16_128b_aligned_vector_shift_prelu_store q0, a2, a10, q1, a13, a14 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_prelu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_hwcn_prelu_n_remainder: - l32i a12, a4, 140 # a12: n_remainder - beqz a12, tie728_s16_unaligned_conv2d_hwcn_prelu_n_remainder_end - - l32i a5, a4, 160 - l32i a15, a4, 164 - EE.MOVI.32.Q q7, a5, 1 - EE.MOVI.32.Q q7, a15, 2 - l32i a5, a4, 168 # filter_ptr unaligned - - EE.MOVI.32.Q q6, a13, 0 # q6[0]: activation_alpha_ptr - movi a13, 15 - sub a13, a13, a7 # a13: 15 - c_remainder - ssr a14 # ssr: activation_shift - - tie728_s16_unaligned_conv2d_hwcn_prelu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - movi a14, 0 # a14: zero - tie728_s16_unaligned_conv2d_hwc1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a8, a9, a10, a11, a4, a13, a14, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - EE.MOVI.32.A q6, a11, 0 # a11: activation_alpha_ptr - tie728_s16_element_shift_prelu_store a2, a10, a11, a14, a15 - EE.MOVI.32.Q q6, a11, 0 # q6[0]: activation_alpha_ptr - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_prelu_n_remainder_loop - - tie728_s16_unaligned_conv2d_hwcn_prelu_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_hwcn_bias - .type dl_tie728_s16_unaligned_conv2d_hwcn_bias, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_hwcn_bias: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 108 # a8: dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(feature_t) - l32i a9, a4, 112 # a9: dilation_y_offset - l32i a12, a4, 96 # a12: n_div_x = output_channel / (vector_width / element_width) - l32i a13, a4, 68 # a13: bias_ptr - l32i a14, a4, 64 # a14: mac_shift = output.exponent - filter.exponent - input.exponent - - blti a12, 1, tie728_s16_unaligned_conv2d_hwcn_bias_n_remainder - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_hwcn_bias_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_hwcn_bias_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_hwcn_bias_64b - tie728_s16_unaligned_conv2d_hwcn_bias_32b: - tie728_s16_unaligned_conv2d_hwcn_bias_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - tie728_s16_32b_aligned_vector_shift_bias_store q0, a2, a14, q1, a13, a15 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_bias_32b_multiple_loop - j tie728_s16_unaligned_conv2d_hwcn_bias_n_remainder - - tie728_s16_unaligned_conv2d_hwcn_bias_64b: - tie728_s16_unaligned_conv2d_hwcn_bias_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - tie728_s16_64b_aligned_vector_shift_bias_store q0, a2, a14, q1, a13 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_bias_64b_multiple_loop - j tie728_s16_unaligned_conv2d_hwcn_bias_n_remainder - - tie728_s16_unaligned_conv2d_hwcn_bias_128b: - tie728_s16_unaligned_conv2d_hwcn_bias_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - tie728_s16_128b_aligned_vector_shift_bias_store q0, a2, a14, q1, a13 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_bias_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_hwcn_bias_n_remainder: - l32i a12, a4, 140 # a12: n_remainder - beqz a12, tie728_s16_unaligned_conv2d_hwcn_bias_n_remainder_end - - l32i a5, a4, 160 - l32i a15, a4, 164 - EE.MOVI.32.Q q7, a5, 1 - EE.MOVI.32.Q q7, a15, 2 - l32i a5, a4, 168 # filter_ptr unaligned - - EE.MOVI.32.Q q6, a13, 1 # q6[1]: bias_ptr - movi a13, 15 - sub a13, a13, a7 # a13: 15 - c_remainder - l32i a15, a4, 84 - ssr a15 # ssr: activation_shift - - tie728_s16_unaligned_conv2d_hwcn_bias_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - movi a14, 0 # a14: zero - tie728_s16_unaligned_conv2d_hwc1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a8, a9, a10, a11, a4, a13, a14, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - EE.MOVI.32.A q6, a11, 1 # a11: bias_ptr - tie728_s16_element_shift_bias_store a2, a10, a11, a14, a15 - EE.MOVI.32.Q q6, a11, 1 # q6[1]: bias_ptr - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_bias_n_remainder_loop - - tie728_s16_unaligned_conv2d_hwcn_bias_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_hwcn_bias_relu - .type dl_tie728_s16_unaligned_conv2d_hwcn_bias_relu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_hwcn_bias_relu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 108 # a8: dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(feature_t) - l32i a9, a4, 112 # a9: dilation_y_offset - l32i a12, a4, 96 # a12: n_div_x = output_channel / (vector_width / element_width) - l32i a13, a4, 68 # a13: bias_ptr - l32i a14, a4, 76 # a14: activation_alpha - - blti a12, 1, tie728_s16_unaligned_conv2d_hwcn_bias_relu_n_remainder - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_hwcn_bias_relu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_hwcn_bias_relu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_hwcn_bias_relu_64b - tie728_s16_unaligned_conv2d_hwcn_bias_relu_32b: - tie728_s16_unaligned_conv2d_hwcn_bias_relu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 84 # a11: activation_shift - tie728_s16_32b_aligned_vector_shift_bias_relu_store q0, a2, a10, q1, a13, a14, a11, a15 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_bias_relu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_hwcn_bias_relu_n_remainder - - tie728_s16_unaligned_conv2d_hwcn_bias_relu_64b: - tie728_s16_unaligned_conv2d_hwcn_bias_relu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 84 # a11: activation_shift - tie728_s16_64b_aligned_vector_shift_bias_relu_store q0, a2, a10, q1, a13, a14, a11 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_bias_relu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_hwcn_bias_relu_n_remainder - - tie728_s16_unaligned_conv2d_hwcn_bias_relu_128b: - tie728_s16_unaligned_conv2d_hwcn_bias_relu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 84 # a11: activation_shift - tie728_s16_128b_aligned_vector_shift_bias_relu_store q0, a2, a10, q1, a13, a14, a11 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_bias_relu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_hwcn_bias_relu_n_remainder: - l32i a12, a4, 140 # a12: n_remainder - beqz a12, tie728_s16_unaligned_conv2d_hwcn_bias_relu_n_remainder_end - - l32i a5, a4, 160 - l32i a15, a4, 164 - EE.MOVI.32.Q q7, a5, 1 - EE.MOVI.32.Q q7, a15, 2 - l32i a5, a4, 168 # filter_ptr unaligned - - EE.MOVI.32.Q q6, a13, 1 # q6[1]: bias_ptr - movi a13, 15 - sub a13, a13, a7 # a13: 15 - c_remainder - l32i a15, a4, 84 - ssr a15 # ssr: activation_shift - - tie728_s16_unaligned_conv2d_hwcn_bias_relu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - movi a14, 0 # a14: zero - tie728_s16_unaligned_conv2d_hwc1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a8, a9, a10, a11, a4, a13, a14, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - EE.MOVI.32.A q6, a11, 1 # a11: bias_ptr - tie728_s16_element_shift_bias_relu_store a2, a10, a11, a14, a15 - EE.MOVI.32.Q q6, a11, 1 # q6[1]: bias_ptr - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_bias_relu_n_remainder_loop - - tie728_s16_unaligned_conv2d_hwcn_bias_relu_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu - .type dl_tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 108 # a8: dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(feature_t) - l32i a9, a4, 112 # a9: dilation_y_offset - l32i a12, a4, 96 # a12: n_div_x = output_channel / (vector_width / element_width) - l32i a13, a4, 68 # a13: bias_ptr - l32i a14, a4, 76 # a14: activation_alpha - - blti a12, 1, tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_n_remainder - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_64b - tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_32b: - tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 84 # a11: activation_shift - tie728_s16_32b_aligned_vector_shift_bias_relu_store q0, a2, a10, q1, a13, a14, a11, a15 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_n_remainder - - tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_64b: - tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 84 # a11: activation_shift - tie728_s16_64b_aligned_vector_shift_bias_relu_store q0, a2, a10, q1, a13, a14, a11 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_n_remainder - - tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_128b: - tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 84 # a11: activation_shift - tie728_s16_128b_aligned_vector_shift_bias_relu_store q0, a2, a10, q1, a13, a14, a11 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_n_remainder: - l32i a12, a4, 140 # a12: n_remainder - beqz a12, tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_n_remainder_end - - l32i a5, a4, 160 - l32i a15, a4, 164 - EE.MOVI.32.Q q7, a5, 1 - EE.MOVI.32.Q q7, a15, 2 - l32i a5, a4, 168 # filter_ptr unaligned - - EE.MOVI.32.Q q6, a13, 1 # q6[1]: bias_ptr - EE.MOVI.32.Q q6, a14, 0 # q6[0]: activation_alpha - movi a13, 15 - sub a13, a13, a7 # a13: 15 - c_remainder - l32i a15, a4, 84 - ssr a15 # ssr: activation_shift - - tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - l32i a9, a4, 112 # a9: dilation_y_offset - movi a14, 0 # a14: zero - tie728_s16_unaligned_conv2d_hwc1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a8, a9, a10, a11, a4, a13, a14, q7 - - l32i a9, a4, 64 # a9: mac_shift = output.exponent - filter.exponent - input.exponent - EE.MOVI.32.A q6, a10, 1 # a10: bias_ptr - EE.MOVI.32.A q6, a11, 0 # a11: activation_alpha - tie728_s16_element_shift_bias_leakyrelu_store a2, a9, a10, a11, a14, a15 - EE.MOVI.32.Q q6, a10, 1 # q6[1]: bias_ptr - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_n_remainder_loop - - tie728_s16_unaligned_conv2d_hwcn_bias_leakyrelu_n_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_conv2d_hwcn_bias_prelu - .type dl_tie728_s16_unaligned_conv2d_hwcn_bias_prelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_conv2d_hwcn_bias_prelu: - .align 4 - entry sp, 128 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 100 # a6: c_div_x_1 = input_channel / (vector_width / element_width) - 1 - l32i a7, a4, 136 # a7: c_remainder = input_channel % (vector_width / element_width) * sizeof(feature_t) - l32i a8, a4, 108 # a8: dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(feature_t) - l32i a9, a4, 112 # a9: dilation_y_offset - l32i a12, a4, 96 # a12: n_div_x = output_channel / (vector_width / element_width) - l32i a13, a4, 68 # a13: bias_ptr - l32i a14, a4, 80 # a14: activation_alpha_ptr - - blti a12, 1, tie728_s16_unaligned_conv2d_hwcn_bias_prelu_n_remainder - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - tie728_s16_unaligned_conv2d_hwcn_bias_prelu_n_div_x: - beqi a15, 0, tie728_s16_unaligned_conv2d_hwcn_bias_prelu_128b - beqi a15, 8, tie728_s16_unaligned_conv2d_hwcn_bias_prelu_64b - tie728_s16_unaligned_conv2d_hwcn_bias_prelu_32b: - tie728_s16_unaligned_conv2d_hwcn_bias_prelu_32b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 84 # a11: activation_shift - tie728_s16_32b_aligned_vector_shift_bias_prelu_store q0, a2, a10, q1, a13, q2, a14, a11, a15 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_bias_prelu_32b_multiple_loop - j tie728_s16_unaligned_conv2d_hwcn_bias_prelu_n_remainder - - tie728_s16_unaligned_conv2d_hwcn_bias_prelu_64b: - tie728_s16_unaligned_conv2d_hwcn_bias_prelu_64b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 84 # a11: activation_shift - tie728_s16_64b_aligned_vector_shift_bias_prelu_store q0, a2, a10, q1, a13, q2, a14, a11 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_bias_prelu_64b_multiple_loop - j tie728_s16_unaligned_conv2d_hwcn_bias_prelu_n_remainder - - tie728_s16_unaligned_conv2d_hwcn_bias_prelu_128b: - tie728_s16_unaligned_conv2d_hwcn_bias_prelu_128b_multiple_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.QACC - - tie728_s16_unaligned_conv2d_hwc8 q0, q1, q2, q3, q4, a15, a5, a6, a7, a8, a9, a10, a11, a4, q7 - - l32i a10, a4, 64 # a10: mac_shift = output.exponent - filter.exponent - input.exponent - l32i a11, a4, 84 # a11: activation_shift - tie728_s16_128b_aligned_vector_shift_bias_prelu_store q0, a2, a10, q1, a13, q2, a14, a11 - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_bias_prelu_128b_multiple_loop - - - tie728_s16_unaligned_conv2d_hwcn_bias_prelu_n_remainder: - l32i a12, a4, 140 # a12: n_remainder - beqz a12, tie728_s16_unaligned_conv2d_hwcn_bias_prelu_n_remainder_end - - l32i a5, a4, 160 - l32i a15, a4, 164 - EE.MOVI.32.Q q7, a5, 1 - EE.MOVI.32.Q q7, a15, 2 - l32i a5, a4, 168 # filter_ptr unaligned - - EE.MOVI.32.Q q6, a13, 1 # q6[1]: bias_ptr - EE.MOVI.32.Q q6, a14, 0 # q6[0]: activation_alpha_ptr - movi a13, 15 - sub a13, a13, a7 # a13: 15 - c_remainder - l32i a15, a4, 84 - ssr a15 # ssr: activation_shift - - tie728_s16_unaligned_conv2d_hwcn_bias_prelu_n_remainder_loop: - mov a15, a3 # a15: input_ptr - EE.ZERO.ACCX - - l32i a9, a4, 112 # a9: dilation_y_offset - movi a14, 0 # a14: zero - tie728_s16_unaligned_conv2d_hwc1 q0, q1, q2, q3, q4, q5, a15, a5, a6, a7, a8, a9, a10, a11, a4, a13, a14, q7 - - l32i a9, a4, 64 # a9: mac_shift = output.exponent - filter.exponent - input.exponent - EE.MOVI.32.A q6, a10, 1 # a10: bias_ptr - EE.MOVI.32.A q6, a11, 0 # a11: activation_alpha_ptr - tie728_s16_element_shift_bias_prelu_store a2, a9, a10, a11, a14, a15 - EE.MOVI.32.Q q6, a10, 1 # q6[1]: bias_ptr - EE.MOVI.32.Q q6, a11, 0 # q6[0]: activation_alpha_ptr - - addi a12, a12, -1 - bnez a12, tie728_s16_unaligned_conv2d_hwcn_bias_prelu_n_remainder_loop - - tie728_s16_unaligned_conv2d_hwcn_bias_prelu_n_remainder_end: - - retw \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_unaligned_depthwise_conv2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_unaligned_depthwise_conv2d.S deleted file mode 100644 index 5e92e303..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s16_unaligned_depthwise_conv2d.S +++ /dev/null @@ -1,1355 +0,0 @@ -#include "dl_tie728_s16_unaligned.S" - -############################################################################################################################################################ -#### -#### tie728_s16_unaligned_depthwise_conv2d_33c1 series -#### -############################################################################################################################################################ -.macro tie728_s16_unaligned_depthwise_conv2d_3381 input_v0, input_v1, input_v2, input_back, filter_v0, filter_v1, filter_v2, input_ptr, filter_ptr, dilation_x_offset_16, dilation_y_offset_16, next_3381_16 - EE.ZERO.QACC - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_y_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v2, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - EE.SRC.Q.LD.IP \input_v2, \input_ptr, 16, \input_v1, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_y_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v2, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - EE.SRC.Q.LD.IP \input_v2, \input_ptr, 16, \input_v1, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \next_3381_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v2, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - EE.SRC.Q.LD.IP \input_v2, \input_ptr, 16, \input_v1, \input_back -.endm - -.macro tie728_s16_unaligned_depthwise_conv2d_3381_last input_v0, input_v1, input_v2, input_back, filter_v0, filter_v1, input_ptr, filter_ptr, dilation_x_offset_16, dilation_y_offset_16, next_3381_16 - EE.ZERO.QACC - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_y_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v2, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v2, \filter_v0 - EE.SRC.Q.LD.IP \input_v2, \input_ptr, 16, \input_v1, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_y_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v0, \filter_v1 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v2, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v1, \filter_v0 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v2, \filter_v1 - EE.SRC.Q.LD.IP \input_v2, \input_ptr, 16, \input_v1, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \next_3381_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q \input_v2, \input_v2, \input_back - - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VMULAS.S16.QACC \input_v2, \filter_v0 -.endm - -.macro tie728_s16_unaligned_depthwise_conv2d_11r1 input_v0, input_front, input_back, filter_v0, filter_front, filter_back, input_ptr, filter_ptr, c_remainder, forward - EE.LD.128.USAR.IP \input_v0, \input_ptr, 16 - EE.VLD.128.XP \input_back, \input_ptr, \forward - EE.SRC.Q \input_v0, \input_v0, \input_back - - EE.LD.128.USAR.XP \filter_v0, \filter_ptr, \c_remainder - EE.VLD.128.IP \filter_back, \filter_ptr, 0 - EE.SRC.Q \filter_v0, \filter_v0, \filter_back - - EE.VMULAS.S16.QACC \input_v0, \filter_v0 -.endm - -.macro tie728_s16_unaligned_depthwise_conv2d_33r1 input_v0, input_front, input_back, filter_v0, filter_front, filter_back, input_ptr, filter_ptr, dilation_x_offset_16, dilation_y_offset_16, c_remainder - EE.ZERO.QACC - - tie728_s16_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_16 - tie728_s16_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_16 - tie728_s16_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_y_offset_16 - - tie728_s16_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_16 - tie728_s16_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_16 - tie728_s16_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_y_offset_16 - - tie728_s16_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_16 - tie728_s16_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_16 - tie728_s16_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_y_offset_16 -.endm - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_depthwise_conv2d_33c1 - .type dl_tie728_s16_unaligned_depthwise_conv2d_33c1, @function - # .section .iram1 -dl_tie728_s16_unaligned_depthwise_conv2d_33c1: - .align 4 - entry sp, 128 - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 124 - addi a6, a6, -16 # a6: dilation_x_offset - 16 - l32i a7, a4, 128 - addi a7, a7, -16 # a7: dilation_y_offset - 16 - l32i a8, a4, 132 - addi a8, a8, -16 # a8: next_3381 - 16 - l32i a9, a4, 100 # a9: c_div_x_1 - l32i a10, a4, 64 # a10: mac_shift - # l32i a11, a4, 68 # a11: bias_ptr - l32i a12, a4, 76 # a12: activation_alpha - l32i a13, a4, 84 # a13: activation_shift - - blti a9, 0, tie728_s16_unaligned_depthwise_conv2d_33c1_c_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.XP q3, a3, a6 - EE.SRC.Q.LD.IP q4, a5, 16, q0, q3 # q4: filter_v0; q0: input_v0 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.LD.128.USAR.XP q3, a3, a6 - EE.SRC.Q.LD.IP q2, a3, 16, q1, q3 # q2: input_v2; q1: input_v1 - - tie728_s16_unaligned_depthwise_conv2d_33c1_c_div_x: - beqi a15, 0, tie728_s16_unaligned_depthwise_conv2d_33c1_128b - beqi a15, 8, tie728_s16_unaligned_depthwise_conv2d_33c1_64b - tie728_s16_unaligned_depthwise_conv2d_33c1_32b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_32b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_32b_aligned_vector_shift_store q3, a2, a10, a14 - - tie728_s16_unaligned_depthwise_conv2d_33c1_32b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_32b_aligned_vector_shift_store q3, a2, a10, a14 - - j tie728_s16_unaligned_depthwise_conv2d_33c1_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_33c1_64b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_64b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_64b_aligned_vector_shift_store q3, a2, a10 - - tie728_s16_unaligned_depthwise_conv2d_33c1_64b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_64b_aligned_vector_shift_store q3, a2, a10 - - j tie728_s16_unaligned_depthwise_conv2d_33c1_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_33c1_128b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_128b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_128b_aligned_vector_shift_store q3, a2, a10 - - tie728_s16_unaligned_depthwise_conv2d_33c1_128b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_128b_aligned_vector_shift_store q3, a2, a10 - - - tie728_s16_unaligned_depthwise_conv2d_33c1_c_remainder: - l32i a9, a4, 136 # a9: c_remainder - beqz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_c_remainder_end - srli a14, a9, 1 - tie728_s16_unaligned_depthwise_conv2d_33r1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a9 - tie728_s16_variable_vector_shift_store q0, a2, a10, a14, a15 - - tie728_s16_unaligned_depthwise_conv2d_33c1_c_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_depthwise_conv2d_33c1_relu - .type dl_tie728_s16_unaligned_depthwise_conv2d_33c1_relu, @function - # .section .iram1 -dl_tie728_s16_unaligned_depthwise_conv2d_33c1_relu: - .align 4 - entry sp, 128 - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 124 - addi a6, a6, -16 # a6: dilation_x_offset - 16 - l32i a7, a4, 128 - addi a7, a7, -16 # a7: dilation_y_offset - 16 - l32i a8, a4, 132 - addi a8, a8, -16 # a8: next_3381 - 16 - l32i a9, a4, 100 # a9: c_div_x_1 - l32i a10, a4, 64 # a10: mac_shift - # l32i a11, a4, 68 # a11: bias_ptr - l32i a12, a4, 76 # a12: activation_alpha - l32i a13, a4, 84 # a13: activation_shift - - blti a9, 0, tie728_s16_unaligned_depthwise_conv2d_33c1_relu_c_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.XP q3, a3, a6 - EE.SRC.Q.LD.IP q4, a5, 16, q0, q3 # q4: filter_v0; q0: input_v0 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.LD.128.USAR.XP q3, a3, a6 - EE.SRC.Q.LD.IP q2, a3, 16, q1, q3 # q2: input_v2; q1: input_v1 - - tie728_s16_unaligned_depthwise_conv2d_33c1_relu_c_div_x: - beqi a15, 0, tie728_s16_unaligned_depthwise_conv2d_33c1_relu_128b - beqi a15, 8, tie728_s16_unaligned_depthwise_conv2d_33c1_relu_64b - tie728_s16_unaligned_depthwise_conv2d_33c1_relu_32b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_relu_32b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_32b_aligned_vector_shift_relu_store q3, a2, a10, a12, a13, a14 - - tie728_s16_unaligned_depthwise_conv2d_33c1_relu_32b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_32b_aligned_vector_shift_relu_store q3, a2, a10, a12, a13, a14 - - j tie728_s16_unaligned_depthwise_conv2d_33c1_relu_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_33c1_relu_64b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_relu_64b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_64b_aligned_vector_shift_relu_store q3, a2, a10, a12, a13 - - tie728_s16_unaligned_depthwise_conv2d_33c1_relu_64b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_64b_aligned_vector_shift_relu_store q3, a2, a10, a12, a13 - - j tie728_s16_unaligned_depthwise_conv2d_33c1_relu_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_33c1_relu_128b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_relu_128b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_128b_aligned_vector_shift_relu_store q3, a2, a10, a12, a13 - - tie728_s16_unaligned_depthwise_conv2d_33c1_relu_128b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_128b_aligned_vector_shift_relu_store q3, a2, a10, a12, a13 - - - tie728_s16_unaligned_depthwise_conv2d_33c1_relu_c_remainder: - l32i a9, a4, 136 # a9: c_remainder - beqz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_relu_c_remainder_end - srli a14, a9, 1 - tie728_s16_unaligned_depthwise_conv2d_33r1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a9 - tie728_s16_variable_vector_shift_relu_store q0, a2, a10, a12, a13, a14, a15 - - tie728_s16_unaligned_depthwise_conv2d_33c1_relu_c_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_depthwise_conv2d_33c1_prelu - .type dl_tie728_s16_unaligned_depthwise_conv2d_33c1_prelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_depthwise_conv2d_33c1_prelu: - .align 4 - entry sp, 128 - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 124 - addi a6, a6, -16 # a6: dilation_x_offset - 16 - l32i a7, a4, 128 - addi a7, a7, -16 # a7: dilation_y_offset - 16 - l32i a8, a4, 132 - addi a8, a8, -16 # a8: next_3381 - 16 - l32i a9, a4, 100 # a9: c_div_x_1 - l32i a10, a4, 64 # a10: mac_shift - # l32i a11, a4, 68 # a11: bias_ptr - l32i a12, a4, 80 # a12: activation_alpha_ptr - l32i a13, a4, 84 # a13: activation_shift - - blti a9, 0, tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_c_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.XP q3, a3, a6 - EE.SRC.Q.LD.IP q4, a5, 16, q0, q3 # q4: filter_v0; q0: input_v0 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.LD.128.USAR.XP q3, a3, a6 - EE.SRC.Q.LD.IP q2, a3, 16, q1, q3 # q2: input_v2; q1: input_v1 - - tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_c_div_x: - beqi a15, 0, tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_128b - beqi a15, 8, tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_64b - tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_32b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_32b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_32b_aligned_vector_shift_prelu_store q3, a2, a10, q5, a12, a13, a14 - - tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_32b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_32b_aligned_vector_shift_prelu_store q3, a2, a10, q5, a12, a13, a14 - - j tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_64b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_64b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_64b_aligned_vector_shift_prelu_store q3, a2, a10, q5, a12, a13 - - tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_64b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_64b_aligned_vector_shift_prelu_store q3, a2, a10, q5, a12, a13 - - j tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_128b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_128b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_128b_aligned_vector_shift_prelu_store q3, a2, a10, q5, a12, a13 - - tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_128b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_128b_aligned_vector_shift_prelu_store q3, a2, a10, q5, a12, a13 - - - tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_c_remainder: - l32i a9, a4, 136 # a9: c_remainder - beqz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_c_remainder_end - srli a14, a9, 1 - tie728_s16_unaligned_depthwise_conv2d_33r1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a9 - tie728_s16_variable_vector_shift_prelu_store q0, a2, a10, q1, a12, a13, a14, a15 - - tie728_s16_unaligned_depthwise_conv2d_33c1_prelu_c_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_depthwise_conv2d_33c1_bias - .type dl_tie728_s16_unaligned_depthwise_conv2d_33c1_bias, @function - # .section .iram1 -dl_tie728_s16_unaligned_depthwise_conv2d_33c1_bias: - .align 4 - entry sp, 128 - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 124 - addi a6, a6, -16 # a6: dilation_x_offset - 16 - l32i a7, a4, 128 - addi a7, a7, -16 # a7: dilation_y_offset - 16 - l32i a8, a4, 132 - addi a8, a8, -16 # a8: next_3381 - 16 - l32i a9, a4, 100 # a9: c_div_x_1 - l32i a10, a4, 64 # a10: mac_shift - l32i a11, a4, 68 # a11: bias_ptr - # l32i a12, a4, 76 # a12: activation_alpha - # l32i a13, a4, 84 # a13: activation_shift - - blti a9, 0, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_c_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.XP q3, a3, a6 - EE.SRC.Q.LD.IP q4, a5, 16, q0, q3 # q4: filter_v0; q0: input_v0 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.LD.128.USAR.XP q3, a3, a6 - EE.SRC.Q.LD.IP q2, a3, 16, q1, q3 # q2: input_v2; q1: input_v1 - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_c_div_x: - beqi a15, 0, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_128b - beqi a15, 8, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_64b - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_32b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_32b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_32b_aligned_vector_shift_bias_store q3, a2, a10, q5, a11, a14 - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_32b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_32b_aligned_vector_shift_bias_store q3, a2, a10, q5, a11, a14 - - j tie728_s16_unaligned_depthwise_conv2d_33c1_bias_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_64b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_64b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_64b_aligned_vector_shift_bias_store q3, a2, a10, q5, a11 - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_64b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_64b_aligned_vector_shift_bias_store q3, a2, a10, q5, a11 - - j tie728_s16_unaligned_depthwise_conv2d_33c1_bias_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_128b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_128b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_128b_aligned_vector_shift_bias_store q3, a2, a10, q5, a11 - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_128b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_128b_aligned_vector_shift_bias_store q3, a2, a10, q5, a11 - - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_c_remainder: - l32i a9, a4, 136 # a9: c_remainder - beqz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_c_remainder_end - srli a14, a9, 1 - tie728_s16_unaligned_depthwise_conv2d_33r1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a9 - tie728_s16_variable_vector_shift_bias_store q0, a2, a10, q1, a11, a14, a15 - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_c_remainder_end: - - retw - - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu - .type dl_tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu, @function - # .section .iram1 -dl_tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu: - .align 4 - entry sp, 128 - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 124 - addi a6, a6, -16 # a6: dilation_x_offset - 16 - l32i a7, a4, 128 - addi a7, a7, -16 # a7: dilation_y_offset - 16 - l32i a8, a4, 132 - addi a8, a8, -16 # a8: next_3381 - 16 - l32i a9, a4, 100 # a9: c_div_x_1 - l32i a10, a4, 64 # a10: mac_shift - l32i a11, a4, 68 # a11: bias_ptr - l32i a12, a4, 76 # a12: activation_alpha - l32i a13, a4, 84 # a13: activation_shift - - blti a9, 0, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_c_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.XP q3, a3, a6 - EE.SRC.Q.LD.IP q4, a5, 16, q0, q3 # q4: filter_v0; q0: input_v0 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.LD.128.USAR.XP q3, a3, a6 - EE.SRC.Q.LD.IP q2, a3, 16, q1, q3 # q2: input_v2; q1: input_v1 - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_c_div_x: - beqi a15, 0, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_128b - beqi a15, 8, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_64b - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_32b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_32b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_32b_aligned_vector_shift_bias_relu_store q3, a2, a10, q5, a11, a12, a13, a14 - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_32b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_32b_aligned_vector_shift_bias_relu_store q3, a2, a10, q5, a11, a12, a13, a14 - - j tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_64b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_64b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_64b_aligned_vector_shift_bias_relu_store q3, a2, a10, q5, a11, a12, a13 - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_64b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_64b_aligned_vector_shift_bias_relu_store q3, a2, a10, q5, a11, a12, a13 - - j tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_128b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_128b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_128b_aligned_vector_shift_bias_relu_store q3, a2, a10, q5, a11, a12, a13 - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_128b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_128b_aligned_vector_shift_bias_relu_store q3, a2, a10, q5, a11, a12, a13 - - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_c_remainder: - l32i a9, a4, 136 # a9: c_remainder - beqz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_c_remainder_end - srli a14, a9, 1 - tie728_s16_unaligned_depthwise_conv2d_33r1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a9 - tie728_s16_variable_vector_shift_bias_relu_store q0, a2, a10, q1, a11, a12, a13, a14, a15 - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_relu_c_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu - .type dl_tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu: - .align 4 - entry sp, 128 - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 124 - addi a6, a6, -16 # a6: dilation_x_offset - 16 - l32i a7, a4, 128 - addi a7, a7, -16 # a7: dilation_y_offset - 16 - l32i a8, a4, 132 - addi a8, a8, -16 # a8: next_3381 - 16 - l32i a9, a4, 100 # a9: c_div_x_1 - l32i a10, a4, 64 # a10: mac_shift - l32i a11, a4, 68 # a11: bias_ptr - l32i a12, a4, 80 # a12: activation_alpha_ptr - l32i a13, a4, 84 # a13: activation_shift - - blti a9, 0, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_c_remainder - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a15 # a15: output_sar_byte - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.XP q3, a3, a6 - EE.SRC.Q.LD.IP q4, a5, 16, q0, q3 # q4: filter_v0; q0: input_v0 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.LD.128.USAR.XP q3, a3, a6 - EE.SRC.Q.LD.IP q2, a3, 16, q1, q3 # q2: input_v2; q1: input_v1 - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_c_div_x: - beqi a15, 0, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_128b - beqi a15, 8, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_64b - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_32b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_32b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_32b_aligned_vector_shift_bias_prelu_store q3, a2, a10, q5, a11, q6, a12, a13, a14 - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_32b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_32b_aligned_vector_shift_bias_prelu_store q3, a2, a10, q5, a11, q6, a12, a13, a14 - - j tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_64b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_64b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_64b_aligned_vector_shift_bias_prelu_store q3, a2, a10, q5, a11, q6, a12, a13 - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_64b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_64b_aligned_vector_shift_bias_prelu_store q3, a2, a10, q5, a11, q6, a12, a13 - - j tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_128b: - loopgtz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_128b_last - tie728_s16_unaligned_depthwise_conv2d_3381 q0, q1, q2, q3, q4, q5, q6, a3, a5, a6, a7, a8 - tie728_s16_128b_aligned_vector_shift_bias_prelu_store q3, a2, a10, q5, a11, q6, a12, a13 - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_128b_last: - tie728_s16_unaligned_depthwise_conv2d_3381_last q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s16_128b_aligned_vector_shift_bias_prelu_store q3, a2, a10, q5, a11, q6, a12, a13 - - - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_c_remainder: - l32i a9, a4, 136 # a9: c_remainder - beqz a9, tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_c_remainder_end - srli a14, a9, 1 - tie728_s16_unaligned_depthwise_conv2d_33r1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a9 - tie728_s16_variable_vector_shift_bias_prelu_store q0, a2, a10, q1, a11, q2, a12, a13, a14, a15 - tie728_s16_unaligned_depthwise_conv2d_33c1_bias_prelu_c_remainder_end: - - retw - - - - - - - - - -############################################################################################################################################################ -#### -#### tie728_s16_unaligned_depthwise_conv2d_hwc1 series -#### -############################################################################################################################################################ -.macro tie728_s16_unaligned_depthwise_conv2d_1w81 input_v0, input_v1, input_back, input_ptr, filter_v0, filter_ptr, dilation_x_offset_16, dilation_y_offset, filter_w, filter_w_rs1_1 filter_y_offset - loopgtz \filter_w_rs1_1, 1f - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v1, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v0 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back - 1: - - bbci \filter_w, 0, 2f - # three 8-input-element left - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v1, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_y_offset - EE.VMULAS.S16.QACC.LD.XP \filter_v0, \filter_ptr, \filter_y_offset, \input_v1, \filter_v0 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q \input_v0, \input_v1, \input_back - EE.LD.128.USAR.IP \input_v1, \input_ptr, 16 - j 3f - - 2: # two 8-input-element left - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_y_offset - EE.VMULAS.S16.QACC.LD.XP \filter_v0, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v1, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v0 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back - 3: -.endm - -.macro tie728_s16_unaligned_depthwise_conv2d_1w81_last input_v0 input_v1 input_back input_ptr filter_v0 filter_ptr dilation_x_offset_16 filter_w filter_w_rs1_1 next_hws1 filter_y_offset - loopgtz \filter_w_rs1_1, 4f - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v1, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v0 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back - 4: - - bbci \filter_w, 0, 5f - # three 8-input-element left - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VMULAS.S16.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v1, \input_back - - EE.LD.128.USAR.XP \input_back, \input_ptr, \next_hws1 - EE.VMULAS.S16.QACC.LD.XP \filter_v0, \filter_ptr, \filter_y_offset, \input_v1, \filter_v0 - EE.SRC.Q \input_v0, \input_v0, \input_back - - EE.VMULAS.S16.QACC \input_v0, \filter_v0 - j 6f - - 5: # two 8-input-element left - EE.LD.128.USAR.XP \input_back, \input_ptr, \next_hws1 - EE.VMULAS.S16.QACC.LD.XP \filter_v0, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - EE.SRC.Q \input_v1, \input_v1, \input_back - - EE.VMULAS.S16.QACC \input_v1, \filter_v0 - 6: -.endm - -.macro tie728_s16_unaligned_depthwise_conv2d_hw81 input_v0, input_v1, input_back, filter_v0, input_ptr, filter_ptr, dilation_x_offset_16, dilation_y_offset_16, next_hws1, filter_h, filter_w, filter_w_rs1_1, args, filter_offset_q, filter_y_offset - l32i \filter_h, \args, 52 # filter_height - l32i \filter_w, \args, 56 # filter_width - - EE.ZERO.QACC - - blti \filter_w, 2, 9f - # filter_w >= 2 - EE.LD.128.USAR.IP \input_v1, \input_ptr, 16 - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_x_offset_16 - EE.VLD.128.IP \filter_v0, \filter_ptr, 16 # filter_v0 - EE.SRC.Q \input_v0, \input_v1, \input_back # input_v0 - EE.LD.128.USAR.IP \input_v1, \input_ptr, 16 - - blti \filter_h, 2, 8f - 7: - tie728_s16_unaligned_depthwise_conv2d_1w81 \input_v0, \input_v1, \input_back, \input_ptr, \filter_v0, \filter_ptr, \dilation_x_offset_16, \dilation_y_offset_16, \filter_w, \filter_w_rs1_1, \filter_y_offset - addi \filter_h, \filter_h, -1 - bgei \filter_h, 2, 7b - 8: # last y - tie728_s16_unaligned_depthwise_conv2d_1w81_last \input_v0, \input_v1, \input_back, \input_ptr, \filter_v0, \filter_ptr, \dilation_x_offset_16, \filter_w, \filter_w_rs1_1, \next_hws1, \filter_y_offset - j 12f - - 9: # filter_w == 1 - EE.LD.128.USAR.IP \input_v1, \input_ptr, 16 - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_y_offset_16 - EE.VLD.128.XP \filter_v0, \filter_ptr, \filter_y_offset # filter_v0 - EE.SRC.Q \input_v0, \input_v1, \input_back # input_v0 - - blti \filter_h, 2, 11f - addi \filter_h, \filter_h, -1 - loopgtz \filter_h, 10f - EE.LD.128.USAR.IP \input_v1, \input_ptr, 16 - EE.LD.128.USAR.XP \input_back, \input_ptr, \dilation_y_offset_16 - EE.VMULAS.S16.QACC.LD.XP \filter_v0, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - EE.SRC.Q \input_v0, \input_v1, \input_back - 10: - 11: # last y - EE.VMULAS.S16.QACC \input_v0, \filter_v0 - sub \input_ptr, \input_ptr, \dilation_y_offset_16 - add \input_ptr, \input_ptr, \next_hws1 - - 12: - EE.MOVI.32.A \filter_offset_q, \filter_h, 2 - add \filter_ptr, \filter_ptr, \filter_h -.endm - -.macro tie728_s16_unaligned_depthwise_conv2d_11r1_padding input_v0, input_front, input_back, filter_v0, filter_front, filter_back, input_ptr, filter_ptr, c_remainder, forward, filter_y_offset - EE.LD.128.USAR.IP \input_v0, \input_ptr, 16 - EE.VLD.128.XP \input_back, \input_ptr, \forward - EE.SRC.Q \input_v0, \input_v0, \input_back - - EE.LD.128.USAR.XP \filter_v0, \filter_ptr, \c_remainder - EE.VLD.128.XP \filter_back, \filter_ptr, \filter_y_offset - EE.SRC.Q \filter_v0, \filter_v0, \filter_back - - EE.VMULAS.S16.QACC \input_v0, \filter_v0 -.endm - -.macro tie728_s16_unaligned_depthwise_conv2d_hwr1 input_v0, input_front, input_back, filter_v0, filter_front, filter_back, input_ptr, filter_ptr, dilation_x_offset_16, dilation_y_offset_16, filter_h, filter_w, filter_w_rs1_1, c_remainder, args, filter_y_offset - l32i \filter_h, \args, 52 # filter_height - l32i \filter_w, \args, 56 # filter_width - - EE.ZERO.QACC - - blti \filter_w, 2, 5f - 4: - loopgtz \filter_w_rs1_1, 1f - tie728_s16_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_16 - tie728_s16_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_16 - 1: - bbci \filter_w, 0, 2f - # 3 left - tie728_s16_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_16 - tie728_s16_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_16 - tie728_s16_unaligned_depthwise_conv2d_11r1_padding \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_y_offset_16, \filter_y_offset - j 3f - 2: - # 2 left - tie728_s16_unaligned_depthwise_conv2d_11r1 \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_x_offset_16 - tie728_s16_unaligned_depthwise_conv2d_11r1_padding \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_y_offset_16, \filter_y_offset - 3: - addi \filter_h, \filter_h, -1 - bgei \filter_h, 1, 4b - - j 7f - - 5: - # filter_w == 1 - loopgtz \filter_h, 6f - tie728_s16_unaligned_depthwise_conv2d_11r1_padding \input_v0, \input_front, \input_back, \filter_v0, \filter_front, \filter_back, \input_ptr, \filter_ptr, \c_remainder, \dilation_y_offset_16, \filter_y_offset - 6: - 7: -.endm - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_depthwise_conv2d_hwc1 - .type dl_tie728_s16_unaligned_depthwise_conv2d_hwc1, @function - # .section .iram1 -dl_tie728_s16_unaligned_depthwise_conv2d_hwc1: - .align 4 - entry sp, 128 - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - l32i a11, a4, 144 - l32i a15, a4, 60 - EE.MOVI.32.Q q7, a11, 2 - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 124 - addi a6, a6, -16 # a6: dilation_x_offset - 16 - l32i a7, a4, 128 - addi a7, a7, -16 # a7: dilation_y_offset - 16 - # a9 - # a10 - l32i a11, a4, 148 # a11: filter_w_rs1_1 - l32i a12, a4, 100 # a12: c_div_x_1 - l32i a13, a4, 64 # a13: mac_shift - # l32i a14, a4, 76 # a14: activation_alpha - # l32i a15, a4, 84 # a15: activation_shift - - - blti a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_c_remainder - l32i a8, a4, 132 - addi a8, a8, -16 # a8: next_hw81 - 16 - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a9 # a9: output_sar_byte - - tie728_s16_unaligned_depthwise_conv2d_hwc1_c_div_x: - beqi a9, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_128b - beqi a9, 8, tie728_s16_unaligned_depthwise_conv2d_hwc1_64b - tie728_s16_unaligned_depthwise_conv2d_hwc1_32b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_32b_multiple_loop: - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a15 - tie728_s16_32b_aligned_vector_shift_store q0, a2, a13, a9 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_32b_multiple_loop - j tie728_s16_unaligned_depthwise_conv2d_hwc1_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_hwc1_64b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_64b_multiple_loop: - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a15 - tie728_s16_64b_aligned_vector_shift_store q0, a2, a13 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_64b_multiple_loop - j tie728_s16_unaligned_depthwise_conv2d_hwc1_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_hwc1_128b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_128b_multiple_loop: - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a15 - tie728_s16_128b_aligned_vector_shift_store q0, a2, a13 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_128b_multiple_loop - - - tie728_s16_unaligned_depthwise_conv2d_hwc1_c_remainder: - l32i a12, a4, 136 # a12: c_remainder - beqz a12, tie728_s16_unaligned_depthwise_conv2d_hwc1_c_remainder_end - srli a8, a12, 1 - l32i a5, a4, 168 # filter_ptr unaligned - l32i a15, a4, 160 - tie728_s16_unaligned_depthwise_conv2d_hwr1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a9, a10, a11, a12, a4, a15 - tie728_s16_variable_vector_shift_store q0, a2, a13, a8, a9 - - tie728_s16_unaligned_depthwise_conv2d_hwc1_c_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_relu - .type dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_relu, @function - # .section .iram1 -dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_relu: - .align 4 - entry sp, 128 - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 124 - addi a6, a6, -16 # a6: dilation_x_offset - 16 - l32i a7, a4, 128 - addi a7, a7, -16 # a7: dilation_y_offset - 16 - # a9 - # a10 - l32i a11, a4, 148 # a11: filter_w_rs1_1 - l32i a12, a4, 100 # a12: c_div_x_1 - l32i a13, a4, 64 # a13: mac_shift - l32i a14, a4, 76 # a14: activation_alpha - l32i a15, a4, 84 # a15: activation_shift - - EE.MOVI.32.Q q7, a13, 3 - - - blti a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_c_remainder - l32i a8, a4, 132 - addi a8, a8, -16 # a8: next_hw81 - 16 - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a9 # a9: output_sar_byte - - tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_c_div_x: - beqi a9, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_128b - beqi a9, 8, tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_64b - tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_32b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_32b_multiple_loop: - EE.MOVI.32.A q7, a13, 1 - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a13 - EE.MOVI.32.A q7, a13, 3 - tie728_s16_32b_aligned_vector_shift_relu_store q0, a2, a13, a14, a15, a9 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_32b_multiple_loop - j tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_64b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_64b_multiple_loop: - EE.MOVI.32.A q7, a13, 1 - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a13 - EE.MOVI.32.A q7, a13, 3 - tie728_s16_64b_aligned_vector_shift_relu_store q0, a2, a13, a14, a15 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_64b_multiple_loop - j tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_128b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_128b_multiple_loop: - EE.MOVI.32.A q7, a13, 1 - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a13 - EE.MOVI.32.A q7, a13, 3 - tie728_s16_128b_aligned_vector_shift_relu_store q0, a2, a13, a14, a15 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_128b_multiple_loop - - - tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_c_remainder: - l32i a12, a4, 136 # a12: c_remainder - beqz a12, tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_c_remainder_end - srli a8, a12, 1 - l32i a5, a4, 168 # filter_ptr unaligned - l32i a13, a4, 160 - tie728_s16_unaligned_depthwise_conv2d_hwr1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a9, a10, a11, a12, a4, a13 - EE.MOVI.32.A q7, a13, 3 - tie728_s16_variable_vector_shift_relu_store q0, a2, a13, a14, a15, a8, a9 - - tie728_s16_unaligned_depthwise_conv2d_hwc1_relu_c_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu - .type dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu: - .align 4 - entry sp, 128 - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 124 - addi a6, a6, -16 # a6: dilation_x_offset - 16 - l32i a7, a4, 128 - addi a7, a7, -16 # a7: dilation_y_offset - 16 - # a9 - # a10 - l32i a11, a4, 148 # a11: filter_w_rs1_1 - l32i a12, a4, 100 # a12: c_div_x_1 - l32i a13, a4, 64 # a13: mac_shift - l32i a14, a4, 80 # a14: activation_alpha_ptr - l32i a15, a4, 84 # a15: activation_shift - - EE.MOVI.32.Q q7, a13, 3 - - - blti a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_c_remainder - l32i a8, a4, 132 - addi a8, a8, -16 # a8: next_hw81 - 16 - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a9 # a9: output_sar_byte - - tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_c_div_x: - beqi a9, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_128b - beqi a9, 8, tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_64b - tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_32b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_32b_multiple_loop: - EE.MOVI.32.A q7, a13, 1 - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a13 - EE.MOVI.32.A q7, a13, 3 - tie728_s16_32b_aligned_vector_shift_prelu_store q0, a2, a13, q1, a14, a15, a9 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_32b_multiple_loop - j tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_64b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_64b_multiple_loop: - EE.MOVI.32.A q7, a13, 1 - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a13 - EE.MOVI.32.A q7, a13, 3 - tie728_s16_64b_aligned_vector_shift_prelu_store q0, a2, a13, q1, a14, a15 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_64b_multiple_loop - j tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_128b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_128b_multiple_loop: - EE.MOVI.32.A q7, a13, 1 - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a13 - EE.MOVI.32.A q7, a13, 3 - tie728_s16_128b_aligned_vector_shift_prelu_store q0, a2, a13, q1, a14, a15 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_128b_multiple_loop - - - tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_c_remainder: - l32i a12, a4, 136 # a12: c_remainder - beqz a12, tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_c_remainder_end - srli a8, a12, 1 - l32i a5, a4, 168 # filter_ptr unaligned - l32i a13, a4, 160 - tie728_s16_unaligned_depthwise_conv2d_hwr1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a9, a10, a11, a12, a4, a13 - EE.MOVI.32.A q7, a13, 3 - tie728_s16_variable_vector_shift_prelu_store q0, a2, a13, q1, a14, a15, a8, a9 - - tie728_s16_unaligned_depthwise_conv2d_hwc1_prelu_c_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias - .type dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias, @function - # .section .iram1 -dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias: - .align 4 - entry sp, 128 - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - l32i a11, a4, 144 - l32i a15, a4, 60 - EE.MOVI.32.Q q7, a11, 2 - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 124 - addi a6, a6, -16 # a6: dilation_x_offset - 16 - l32i a7, a4, 128 - addi a7, a7, -16 # a7: dilation_y_offset - 16 - # a9 - # a10 - l32i a11, a4, 148 # a11: filter_w_rs1_1 - l32i a12, a4, 100 # a12: c_div_x_1 - l32i a13, a4, 64 # a13: mac_shift - l32i a14, a4, 68 # a14: bias_ptr - # a15 - - - blti a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_c_remainder - l32i a8, a4, 132 - addi a8, a8, -16 # a8: next_hw81 - 16 - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a9 # a9: output_sar_byte - - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_c_div_x: - beqi a9, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_128b - beqi a9, 8, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_64b - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_32b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_32b_multiple_loop: - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a15 - tie728_s16_32b_aligned_vector_shift_bias_store q0, a2, a13, q1, a14, a10 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_32b_multiple_loop - j tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_64b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_64b_multiple_loop: - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a15 - tie728_s16_64b_aligned_vector_shift_bias_store q0, a2, a13, q1, a14 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_64b_multiple_loop - j tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_128b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_128b_multiple_loop: - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a15 - tie728_s16_128b_aligned_vector_shift_bias_store q0, a2, a13, q1, a14 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_128b_multiple_loop - - - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_c_remainder: - l32i a12, a4, 136 # a12: c_remainder - beqz a12, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_c_remainder_end - srli a8, a12, 1 - l32i a5, a4, 168 # filter_ptr unaligned - l32i a15, a4, 160 - tie728_s16_unaligned_depthwise_conv2d_hwr1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a9, a10, a11, a12, a4, a15 - tie728_s16_variable_vector_shift_bias_store q0, a2, a13, q1, a14, a8, a10 - - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_c_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu - .type dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu, @function - # .section .iram1 -dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu: - .align 4 - entry sp, 128 - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 124 - addi a6, a6, -16 # a6: dilation_x_offset - 16 - l32i a7, a4, 128 - addi a7, a7, -16 # a7: dilation_y_offset - 16 - # a9 - # a10 - l32i a11, a4, 148 # a11: filter_w_rs1_1 - l32i a12, a4, 100 # a12: c_div_x_1 - l32i a13, a4, 64 # a13: mac_shift - l32i a14, a4, 68 # a14: bias_ptr - l32i a15, a4, 76 # a15: activation_alpha - - EE.MOVI.32.Q q7, a13, 3 - - - blti a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_c_remainder - l32i a8, a4, 132 - addi a8, a8, -16 # a8: next_hw81 - 16 - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a9 # a9: output_sar_byte - - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_c_div_x: - beqi a9, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_128b - beqi a9, 8, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_64b - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_32b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_32b_multiple_loop: - EE.MOVI.32.A q7, a13, 1 - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a13 - l32i a9, a4, 84 # a9: activation_shift - EE.MOVI.32.A q7, a13, 3 - tie728_s16_32b_aligned_vector_shift_bias_relu_store q0, a2, a13, q1, a14, a15, a9, a10 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_32b_multiple_loop - j tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_64b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_64b_multiple_loop: - EE.MOVI.32.A q7, a13, 1 - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a13 - l32i a9, a4, 84 # a9: activation_shift - EE.MOVI.32.A q7, a13, 3 - tie728_s16_64b_aligned_vector_shift_bias_relu_store q0, a2, a13, q1, a14, a15, a9 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_64b_multiple_loop - j tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_128b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_128b_multiple_loop: - EE.MOVI.32.A q7, a13, 1 - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a13 - l32i a9, a4, 84 # a9: activation_shift - EE.MOVI.32.A q7, a13, 3 - tie728_s16_128b_aligned_vector_shift_bias_relu_store q0, a2, a13, q1, a14, a15, a9 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_128b_multiple_loop - - - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_c_remainder: - l32i a12, a4, 136 # a12: c_remainder - beqz a12, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_c_remainder_end - srli a8, a12, 1 - l32i a5, a4, 168 # filter_ptr unaligned - l32i a13, a4, 160 - tie728_s16_unaligned_depthwise_conv2d_hwr1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a9, a10, a11, a12, a4, a13 - l32i a9, a4, 84 # a9: activation_shift - EE.MOVI.32.A q7, a13, 3 - tie728_s16_variable_vector_shift_bias_relu_store q0, a2, a13, q1, a14, a15, a9, a8, a10 - - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_relu_c_remainder_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu - .type dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu, @function - # .section .iram1 -dl_tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu: - .align 4 - entry sp, 128 - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - l32i a5, a4, 48 # a5: filter_ptr - l32i a6, a4, 124 - addi a6, a6, -16 # a6: dilation_x_offset - 16 - l32i a7, a4, 128 - addi a7, a7, -16 # a7: dilation_y_offset - 16 - # a9 - # a10 - l32i a11, a4, 148 # a11: filter_w_rs1_1 - l32i a12, a4, 100 # a12: c_div_x_1 - l32i a13, a4, 64 # a13: mac_shift - l32i a14, a4, 68 # a14: bias_ptr - l32i a15, a4, 80 # a15: activation_alpha_ptr - - EE.MOVI.32.Q q7, a13, 3 - - blti a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_c_remainder - l32i a8, a4, 132 - addi a8, a8, -16 # a8: next_hw81 - 16 - - EE.LD.128.USAR.IP q0, a2, 0 - rur.sar_byte a9 # a9: output_sar_byte - - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_c_div_x: - beqi a9, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_128b - beqi a9, 8, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_64b - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_32b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_32b_multiple_loop: - EE.MOVI.32.A q7, a13, 1 - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a13 - l32i a9, a4, 84 # a9: activation_shift - EE.MOVI.32.A q7, a13, 3 - tie728_s16_32b_aligned_vector_shift_bias_prelu_store q0, a2, a13, q1, a14, q2, a15, a9, a10 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_32b_multiple_loop - j tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_64b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_64b_multiple_loop: - EE.MOVI.32.A q7, a13, 1 - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a13 - l32i a9, a4, 84 # a9: activation_shift - EE.MOVI.32.A q7, a13, 3 - tie728_s16_64b_aligned_vector_shift_bias_prelu_store q0, a2, a13, q1, a14, q2, a15, a9 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_64b_multiple_loop - j tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_c_remainder - - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_128b: - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_128b_multiple_loop: - EE.MOVI.32.A q7, a13, 1 - tie728_s16_unaligned_depthwise_conv2d_hw81 q0, q1, q2, q3, a3, a5, a6, a7, a8, a9, a10, a11, a4, q7, a13 - l32i a9, a4, 84 # a9: activation_shift - EE.MOVI.32.A q7, a13, 3 - tie728_s16_128b_aligned_vector_shift_bias_prelu_store q0, a2, a13, q1, a14, q2, a15, a9 - - addi a12, a12, -1 - bgei a12, 0, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_128b_multiple_loop - - - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_c_remainder: - l32i a12, a4, 136 # a12: c_remainder - beqz a12, tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_c_remainder_end - srli a8, a12, 1 - l32i a5, a4, 168 # filter_ptr unaligned - l32i a13, a4, 160 - tie728_s16_unaligned_depthwise_conv2d_hwr1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a9, a10, a11, a12, a4, a13 - l32i a9, a4, 84 # a9: activation_shift - EE.MOVI.32.A q7, a13, 3 - tie728_s16_variable_vector_shift_bias_prelu_store q0, a2, a13, q1, a14, q2, a15, a9, a8, a10 - - tie728_s16_unaligned_depthwise_conv2d_hwc1_bias_prelu_c_remainder_end: - - retw \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8.S deleted file mode 100644 index 58638708..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8.S +++ /dev/null @@ -1,518 +0,0 @@ -#pragma once - -.macro dl_tie728_s8_unaligned_store0 output_v, output_ptr, tmp32 - EE.MOVI.32.A \output_v, \tmp32, 0 - s32i \tmp32, \output_ptr, 0 - EE.MOVI.32.A \output_v, \tmp32, 1 - s32i \tmp32, \output_ptr, 4 - EE.MOVI.32.A \output_v, \tmp32, 2 - s32i \tmp32, \output_ptr, 8 - EE.MOVI.32.A \output_v, \tmp32, 3 - s32i \tmp32, \output_ptr, 12 - addi \output_ptr, \output_ptr, 16 -.endm - -.macro dl_tie728_s8_unaligned_store1 output_v, output_ptr - EE.VST.L.64.IP \output_v, \output_ptr, 8 - EE.VST.H.64.IP \output_v, \output_ptr, 8 -.endm - - -.macro dl_tie728_s8_last_store_data tmp_q, output_v, tmp_a, c_remainder_bytes - movi \tmp_a, 15 - sub \tmp_a, \tmp_a, \c_remainder_bytes - movi \c_remainder_bytes, 0 - EE.SLCXXP.2Q \tmp_q, \output_v, \tmp_a, \c_remainder_bytes #left shift to make the rest part 0 - EE.SRCXXP.2Q \output_v, \tmp_q, \tmp_a, \c_remainder_bytes #right shift to lower bits -.endm - - -.macro dl_tie728_s8_store_remainder output_v, tmp_a0, tmp_a1, tmp_a2, tmp_a3, output_ptr, remainder_c - EE.MOVI.32.A \output_v, \tmp_a0, 0 -615: # remainder_c == 15, 0x1111 - bbci \remainder_c, 3, 607f - EE.MOVI.32.A \output_v, \tmp_a1, 1 - bbci \remainder_c, 2, 611f - EE.MOVI.32.A \output_v, \tmp_a2, 2 - bbci \remainder_c, 1, 613f - EE.MOVI.32.A \output_v, \tmp_a3, 3 - bbci \remainder_c, 0, 614f - - s32i \tmp_a0, \output_ptr, 0 - s32i \tmp_a1, \output_ptr, 4 - s32i \tmp_a2, \output_ptr, 8 - s16i \tmp_a3, \output_ptr, 12 - srai \tmp_a3, \tmp_a3, 16 - s8i \tmp_a3, \output_ptr, 14 - j 616f - -614: # remainder_c == 14, 0x1110 - s32i \tmp_a0, \output_ptr, 0 - s32i \tmp_a1, \output_ptr, 4 - s32i \tmp_a2, \output_ptr, 8 - s16i \tmp_a3, \output_ptr, 12 - j 616f - -613: # remainder_c == 13, 0x1101 - bbci \remainder_c, 0, 612f - EE.MOVI.32.A \output_v, \tmp_a3, 3 - s32i \tmp_a0, \output_ptr, 0 - s32i \tmp_a1, \output_ptr, 4 - s32i \tmp_a2, \output_ptr, 8 - s8i \tmp_a3, \output_ptr, 12 - j 616f - -612: # remainder_c == 12, 0x1100 - s32i \tmp_a0, \output_ptr, 0 - s32i \tmp_a1, \output_ptr, 4 - s32i \tmp_a2, \output_ptr, 8 - j 616f - -611: # remainder_c == 11, 0x1011 - bbci \remainder_c, 1, 609f - EE.MOVI.32.A \output_v, \tmp_a2, 2 - bbci \remainder_c, 0, 610f - s32i \tmp_a0, \output_ptr, 0 - s32i \tmp_a1, \output_ptr, 4 - s16i \tmp_a2, \output_ptr, 8 - srai \tmp_a2, \tmp_a2, 16 - s8i \tmp_a2, \output_ptr, 10 - j 616f -610: # remainder_c == 10, 0x1010 - s32i \tmp_a0, \output_ptr, 0 - s32i \tmp_a1, \output_ptr, 4 - s16i \tmp_a2, \output_ptr, 8 - j 616f -609: # remainder_c == 9, 0x1001 - bbci \remainder_c, 0, 608f - EE.MOVI.32.A \output_v, \tmp_a2, 2 - s32i \tmp_a0, \output_ptr, 0 - s32i \tmp_a1, \output_ptr, 4 - s8i \tmp_a2, \output_ptr, 8 - j 616f -608: # remainder_c == 8, 0x1000 - s32i \tmp_a0, \output_ptr, 0 - s32i \tmp_a1, \output_ptr, 4 - j 616f - -607: # remainder == 7, 0x111 - bbci \remainder_c, 2, 603f - bbci \remainder_c, 1, 605f - EE.MOVI.32.A \output_v, \tmp_a1, 1 - bbci \remainder_c, 0, 606f - s32i \tmp_a0, \output_ptr, 0 - s16i \tmp_a1, \output_ptr, 4 - srai \tmp_a1, \tmp_a1, 16 - s8i \tmp_a1, \output_ptr, 6 - j 616f - -606: # remainder == 6, 0x110 - s32i \tmp_a0, \output_ptr, 0 - s16i \tmp_a1, \output_ptr, 4 - j 616f - -605: # remainder == 4, 5 - bbci \remainder_c, 0, 604f - # remainder == 5, 0x101 - EE.MOVI.32.A \output_v, \tmp_a1, 1 - s32i \tmp_a0, \output_ptr, 0 - s8i \tmp_a1, \output_ptr, 4 - j 616f - -604: # remainder == 4, 0x100 - s32i \tmp_a0, \output_ptr, 0 - j 616f - -603: # remainder == 1, 2, 3 - bbci \remainder_c, 1, 601f - bbci \remainder_c, 0, 602f - # remainder == 3, 0x011 - s16i \tmp_a0, \output_ptr, 0 - srai \tmp_a0, \tmp_a0, 16 - s8i \tmp_a0, \output_ptr, 2 - j 616f - -602: # remainder == 2, 0x010 - s16i \tmp_a0, \output_ptr, 0 - j 616f - -601: # remainder == 1, 0x001 - s8i \tmp_a0, \output_ptr, 0 - -616: -.endm - - -############################################################################################################################################################ -# result process for Conv2D / Depthwise_Conv2D -############################################################################################################################################################ - - -.macro tie728_s8_conv2d_per_layer_result output_v mac_shift - EE.SRCMB.S8.QACC \output_v, \mac_shift, 0 -.endm - - -.macro tie728_s8_vector_round_result output_v mac_shift tmp tmp_q1 - beqz \mac_shift, 500f - - MOVI \tmp, 257 // 0000 0000 0000 0000 0000 0001 0000 0001 - EE.MOVI.32.Q \output_v, \tmp, 0 - EE.MOVI.32.Q \output_v, \tmp, 1 - EE.MOVI.32.Q \output_v, \tmp, 2 - EE.MOVI.32.Q \output_v, \tmp, 3 - movi.n \tmp, 16 - wsr.sar \tmp - EE.VSL.32 \tmp_q1, \output_v - EE.ORQ \tmp_q1, \tmp_q1, \output_v // 0000 0001 0000 0001 0000 0001 0000 0001 - - addi \tmp, \mac_shift, -1 - EE.SRCMB.S8.QACC \output_v, \tmp, 0 - - movi.n \tmp, 1 - EE.MOVI.32.Q \output_v, \tmp, 0 - EE.VSMULAS.S8.QACC \tmp_q1, \output_v, 0 // qacc[0:16] += round - EE.SRCMB.S8.QACC \output_v, \tmp, 0 - j 501f -500: - EE.SRCMB.S8.QACC \output_v, \mac_shift, 0 -501: -.endm - - -.macro tie728_s8_element_round_result output mac_shift tmp tmp_q1 - beqz \mac_shift, 505f - - addi \tmp, \mac_shift, -1 - EE.SRS.ACCX \output, \tmp, 0 - - movi.n \tmp, 1 - EE.ZERO.Q \tmp_q1 - EE.MOVI.32.Q \tmp_q1, \tmp, 0 - EE.VMULAS.S8.ACCX \tmp_q1, \tmp_q1 - EE.SRS.ACCX \output, \tmp, 0 - j 506f -505: - EE.SRS.ACCX \output, \mac_shift, 0 -506: -.endm - - -# what if a1 not 16 byte aligned? -.macro tie728_s8_conv2d_per_channel_result output_v scale_q scale_factor tmp tmp_q1 - # entry need to be 128 - movi \tmp, 4 - EE.SRCMB.S16.QACC \output_v, \tmp, 0 # get the lower 16 bit in QACC - - mov \tmp, a1 - EE.ST.QACC_L.L.128.IP \tmp, 16 - EE.ST.QACC_L.H.32.IP \tmp, 16 - EE.ST.QACC_H.L.128.IP \tmp, 16 - EE.ST.QACC_H.H.32.IP \tmp, 4 - - movi \tmp, 20 - EE.SRCMB.S16.QACC \tmp_q1, \tmp, 0 # \tmp_q1: even 16 bit - - l16si \tmp, a1, 5 #re-arrange qacc odd 16 bit - s16i \tmp, a1, 2 - l16si \tmp, a1, 10 - s16i \tmp, a1, 4 - l16si \tmp, a1, 15 - s16i \tmp, a1, 6 - l16si \tmp, a1, 32 - s16i \tmp, a1, 8 - l16si \tmp, a1, 37 - s16i \tmp, a1, 10 - l16si \tmp, a1, 42 - s16i \tmp, a1, 12 - l16si \tmp, a1, 47 - s16i \tmp, a1, 14 - - EE.VLD.128.IP \output_v, a1, 0 # \output_v: odd 16 bit - - EE.VZIP.16 \output_v, \tmp_q1 - - movi \tmp, 11 - ssr \tmp - - EE.VLD.128.IP \scale_q, \scale_factor, 16 # scale factor - movi \tmp, 0 - EE.VMUL.S16 \output_v, \output_v, \scale_q - - EE.VLD.128.IP \scale_q, \scale_factor, 16 - EE.MOV.S16.QACC \output_v - EE.VMUL.S16 \tmp_q1, \tmp_q1, \scale_q - - EE.SRCMB.S8.QACC \output_v, \tmp, 0 - EE.MOV.S16.QACC \tmp_q1 - EE.SRCMB.S8.QACC \tmp_q1, \tmp, 0 - EE.VUNZIP.8 \output_v, \tmp_q1 -.endm - - -.macro tie728_s8_conv2d_per_channel_with_bias_result output_v scale_q scale_factor bias_ptr tmp tmp_q1 - # entry need to be 128 - movi \tmp, 4 - EE.SRCMB.S16.QACC \output_v, \tmp, 0 # get the lower 16 bit in QACC - - mov \tmp, a1 - EE.ST.QACC_L.L.128.IP \tmp, 16 - EE.ST.QACC_L.H.32.IP \tmp, 16 - EE.ST.QACC_H.L.128.IP \tmp, 16 - EE.ST.QACC_H.H.32.IP \tmp, 4 - - movi \tmp, 20 - EE.SRCMB.S16.QACC \tmp_q1, \tmp, 0 # \tmp_q1: even 16 bit - - l16si \tmp, a1, 5 #re-arrange qacc odd 16 bit - s16i \tmp, a1, 2 - l16si \tmp, a1, 10 - s16i \tmp, a1, 4 - l16si \tmp, a1, 15 - s16i \tmp, a1, 6 - l16si \tmp, a1, 32 - s16i \tmp, a1, 8 - l16si \tmp, a1, 37 - s16i \tmp, a1, 10 - l16si \tmp, a1, 42 - s16i \tmp, a1, 12 - l16si \tmp, a1, 47 - s16i \tmp, a1, 14 - - EE.VLD.128.IP \output_v, a1, 0 # \output_v: odd 16 bit - - EE.VZIP.16 \output_v, \tmp_q1 - - - EE.VLD.128.IP \scale_q, \bias_ptr, 16 # load bias - movi \tmp, 11 - EE.VADDS.S16 \output_v, \output_v, \scale_q # add int16 bias with exponent(input+filter-4) - - - EE.VLD.128.IP \scale_q, \bias_ptr, 16 # load bias - ssr \tmp - EE.VADDS.S16 \tmp_q1, \tmp_q1, \scale_q - - - EE.VLD.128.IP \scale_q, \scale_factor, 16 # scale factor - movi \tmp, 0 - EE.VMUL.S16 \output_v, \output_v, \scale_q - - EE.VLD.128.IP \scale_q, \scale_factor, 16 - EE.MOV.S16.QACC \output_v - EE.VMUL.S16 \tmp_q1, \tmp_q1, \scale_q - - EE.SRCMB.S8.QACC \output_v, \tmp, 0 - EE.MOV.S16.QACC \tmp_q1 - EE.SRCMB.S8.QACC \tmp_q1, \tmp, 0 - EE.VUNZIP.8 \output_v, \tmp_q1 - -.endm - - -.macro tie728_s8_conv2d_128b_vector_bias bias_ptr - EE.LD.QACC_L.L.128.IP \bias_ptr, 16 - EE.LD.QACC_L.H.32.IP \bias_ptr, 16 - EE.LD.QACC_H.L.128.IP \bias_ptr, 16 - EE.LD.QACC_H.H.32.IP \bias_ptr, 16 -.endm - - -.macro tie728_s8_conv2d_element_bias bias_ptr - EE.LD.ACCX.IP \bias_ptr, 8 -.endm - - -.macro tie728_s8_conv2d_bias output_v bias_v bias_ptr - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - - # bias - EE.VADDS.S8 \output_v, \output_v, \bias_v -.endm - - - -.macro tie728_s8_conv2d_bias_relu output_v bias_v bias_ptr activation_alpha activation_shift - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - - # bias - EE.VADDS.S8 \output_v, \output_v, \bias_v - - # LeakyReLU - EE.VRELU.S8 \output_v, \activation_alpha, \activation_shift -.endm - - - -.macro tie728_s8_conv2d_bias_prelu output_v bias_v bias_ptr activation_v activation_alpha_ptr activation_shift - EE.VLD.128.IP \bias_v, \bias_ptr, 16 # load bias - EE.VLD.128.IP \activation_v, \activation_alpha_ptr, 16 # load PReLU alph - - # bias - EE.VADDS.S8 \output_v, \output_v, \bias_v - - # PReLU - EE.VPRELU.S8 \output_v, \output_v, \activation_v, \activation_shift -.endm - - - -.macro tie728_s8_conv2d_relu output_v activation_alpha activation_shift - - # LeakyReLU - EE.VRELU.S8 \output_v, \activation_alpha, \activation_shift - -.endm - - - -.macro tie728_s8_conv2d_prelu output_v activation_v activation_alpha_ptr activation_shift - EE.VLD.128.IP \activation_v, \activation_alpha_ptr, 16 # load PReLU alph - - # PReLU - EE.VPRELU.S8 \output_v, \output_v, \activation_v, \activation_shift -.endm - - - - -############################################################################################################################################################ -#### -#### tie728_s8_unaligned_conv2d / depthwise_conv2d -#### -############################################################################################################################################################ - - - -.macro tie728_s8_unaligned_conv2d_operation_type operation_type mac_shift bias_ptr activation_shift activation_alpha args - movi \operation_type, 0 -0: // per-layer or per-channel - bltz \mac_shift, 1f - j 2f - 1: //per_channel - l32i \mac_shift, \args, 104 // filter_channel_factor address - addi \operation_type, \operation_type, 6 - -2: // bias - beqz \bias_ptr, 3f // no bias - addi \operation_type, \operation_type, 3 - -3: // activation dl_tie728_s8_unaligned_conv2d_activation - bltz \activation_shift, 5f // no activation - - beqz \activation_alpha, 4f - addi \operation_type, \operation_type, 2 - j 5f - - 4: - l32i \activation_alpha, \args, 76 // load activation_alpha - addi \operation_type, \operation_type, 1 // Relu or LeakyReLU - -5: //put operation type in \operation_type - -.endm - - - -.macro tie728_s8_conv2d_1_1_unaligned_c_result operation_type output_v mac_shift bias_ptr activation_alpha activation_shift tmp tmp_q1 tmp_q2 - - bbci \operation_type, 3, 7f - bbci \operation_type, 2, 11f -11: # per_channel bias + prelu, 0x1011 - bbci \operation_type, 1, 9f - bbci \operation_type, 0, 10f - - # tie728_s8_conv2d_per_channel_result \output_v, \tmp_q2, \mac_shift, \tmp, \tmp_q1 - # tie728_s8_conv2d_bias_prelu \output_v, \tmp_q1, \bias_ptr, \tmp_q2, \activation_alpha, \activation_shift - - tie728_s8_conv2d_per_channel_with_bias_result \output_v, \tmp_q2, \mac_shift, \bias_ptr, \tmp, \tmp_q1 - tie728_s8_conv2d_prelu \output_v, \tmp_q2, \activation_alpha, \activation_shift - - j 16f # jump to 16f -10: # per_channel bias + relu, 0x1010 - # tie728_s8_conv2d_per_channel_result \output_v, \tmp_q2, \mac_shift, \tmp, \tmp_q1 - # tie728_s8_conv2d_bias_relu \output_v, \tmp_q1, \bias_ptr, \activation_alpha, \activation_shift - - tie728_s8_conv2d_per_channel_with_bias_result \output_v, \tmp_q2, \mac_shift, \bias_ptr, \tmp, \tmp_q1 - tie728_s8_conv2d_relu \output_v, \activation_alpha, \activation_shift - j 16f # jump to 16f - -9: # per_channel bias, 0x1001 - bbci \operation_type, 0, 8f - # tie728_s8_conv2d_per_channel_result \output_v, \tmp_q2, \mac_shift, \tmp, \tmp_q1 - # tie728_s8_conv2d_bias \output_v, \tmp_q1, \bias_ptr - tie728_s8_conv2d_per_channel_with_bias_result \output_v, \tmp_q2, \mac_shift, \bias_ptr, \tmp, \tmp_q1 - - j 16f # jump to 16f -8: #per_channel no_bias + prelu, 0x1000 - tie728_s8_conv2d_per_channel_result \output_v, \tmp_q2, \mac_shift, \tmp, \tmp_q1 - tie728_s8_conv2d_prelu \output_v, \tmp_q2, \activation_alpha, \activation_shift - - j 16f # jump to 16f - -7: # per_channel no_bias + relu, 0x111 - bbci \operation_type, 2, 3f - bbci \operation_type, 1, 5f - bbci \operation_type, 0, 6f - - tie728_s8_conv2d_per_channel_result \output_v, \tmp_q2, \mac_shift, \tmp, \tmp_q1 - tie728_s8_conv2d_relu \output_v, \activation_alpha, \activation_shift - - j 16f # jump to 16f - -6: # per_channel no_bias, 0x110 - tie728_s8_conv2d_per_channel_result \output_v, \tmp_q2, \mac_shift, \tmp, \tmp_q1 - - j 16f # jump to 16f - -5: # remainder == 4, 5 - bbci \operation_type, 0, 4f - # per_layer bias + prelu, 0x101 - # tie728_s8_conv2d_per_layer_result \output_v, \mac_shift - tie728_s8_vector_round_result \output_v, \mac_shift, \tmp, \tmp_q1 - # tie728_s8_conv2d_bias_prelu \output_v, \tmp_q1, \bias_ptr, \tmp_q2, \activation_alpha, \activation_shift - # bias will be preload - tie728_s8_conv2d_prelu \output_v, \tmp_q2, \activation_alpha, \activation_shift - - j 16f # jump to 16f - -4: # per_layer bias + relu, 0x100 - # tie728_s8_conv2d_per_layer_result \output_v, \mac_shift - tie728_s8_vector_round_result \output_v, \mac_shift, \tmp, \tmp_q1 - # tie728_s8_conv2d_bias_relu \output_v, \tmp_q1, \bias_ptr, \activation_alpha, \activation_shift - # bias will be preload - tie728_s8_conv2d_relu \output_v, \activation_alpha, \activation_shift - - j 16f # jump to 16f - -3: # remainder == 1, 2, 3 - bbci \operation_type, 1, 1f - bbci \operation_type, 0, 2f - # per_layer bias, 0x011 - # tie728_s8_conv2d_per_layer_result \output_v, \mac_shift - tie728_s8_vector_round_result \output_v, \mac_shift, \tmp, \tmp_q1 - # bias will be preload - # tie728_s8_conv2d_bias \output_v, \tmp_q1, \bias_ptr - - j 16f # jump to 16f - -2: # per_layer no_bias + prelu, 0x010 - # tie728_s8_conv2d_per_layer_result \output_v, \mac_shift - tie728_s8_vector_round_result \output_v, \mac_shift, \tmp, \tmp_q1 - tie728_s8_conv2d_prelu \output_v, \tmp_q2, \activation_alpha, \activation_shift - - j 16f # jump to 16f - -1: # no_bias + relu, 0x001 - bbci \operation_type, 0, 0f - # tie728_s8_conv2d_per_layer_result \output_v, \mac_shift - tie728_s8_vector_round_result \output_v, \mac_shift, \tmp, \tmp_q1 - tie728_s8_conv2d_relu \output_v, \activation_alpha, \activation_shift - - j 16f # jump to 16f - -0: # per_layer no_bias, 0x000 - # tie728_s8_conv2d_per_layer_result \output_v, \mac_shift - tie728_s8_vector_round_result \output_v, \mac_shift, \tmp, \tmp_q1 -16: -.endm diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_add2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_add2d.S deleted file mode 100644 index b67f2d9b..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_add2d.S +++ /dev/null @@ -1,1517 +0,0 @@ -#include "dl_tie728_s8.S" - -############################################################################################################################################################ -#### -#### tie728_s8_add2d_11c series -#### -############################################################################################################################################################ - -.macro dl_tie728_rescale_add_rescale_output input0, input1, output, output_scale, output_shift - EE.ZERO.QACC - EE.VMULAS.S8.QACC \input0, \output_scale - EE.VMULAS.S8.QACC \input1, \output_scale - EE.SRCMB.S8.QACC \output, \output_shift, 0 -.endm - - - - .align 4 - .text - .global dl_tie728_s8_add2d_11c - .type dl_tie728_s8_add2d_11c, @function - # .section .iram1 -dl_tie728_s8_add2d_11c: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_2x_1 - # a7: c_left_x_1 - - l32i a6, a5, 68 - l32i a7, a5, 72 - - blti a6, 1, dl_tie728_s8_add2d_small_channel - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.VLD.128.IP q2, a3, 16 - EE.VADDS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VADDS.S8.LD.INCP q1, a4, q5, q2, q3 - EE.VST.128.IP q5, a2, 16 - 0: - - beqi a7, 1, 2f #remainder == 2*16byte - beqi a7, 2, 3f #remainder == 3*16byte - - 2: - EE.VLD.128.IP q2, a3, 16 - EE.VADDS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VST.128.IP q4, a2, 16 - - EE.VADDS.S8 q5, q2, q3 - EE.VST.128.IP q5, a2, 16 - retw - - 3: - EE.VLD.128.IP q2, a3, 16 - EE.VADDS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VADDS.S8.LD.INCP q1, a4, q5, q2, q3 - EE.VST.128.IP q5, a2, 16 - - EE.VADDS.S8 q4, q0, q1 - EE.VST.128.IP q4, a2, 16 - - retw - - -dl_tie728_s8_add2d_small_channel: # channel < 3*s (16) - - loopgtz a7, 0f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - EE.VADDS.S8 q2, q0, q1 - EE.VST.128.IP q2, a2, 16 - 0: - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - EE.VADDS.S8 q2, q0, q1 - EE.VST.128.IP q2, a2, 16 - retw - - - - - .align 4 - .text - .global dl_tie728_s8_rescale_add2d_11c - .type dl_tie728_s8_rescale_add2d_11c, @function - # .section .iram1 -dl_tie728_s8_rescale_add2d_11c: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr: >> shift or *scale) >> shift - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr: input1 >> shift + input0 * 1 - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - - - l32i a6, a5, 64 - l32i a7, a5, 88 - l32i a8, a5, 96 - l32i a9, a5, 92 - - beqi a8, 1, dl_tie728_s8_rescale_add2d_output - -dl_tie728_s8_rescale_add2d_output_scale: # *scale) >> shift - - s8i a8, a1, 0 - EE.VLDBC.8 q7, a1 # all output_scale - - loopgtz a6, 0f - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_add_rescale_output q0, q1, q1, q7, a9 - - EE.VST.128.IP q1, a2, 16 - 0: - - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_add_rescale_output q0, q1, q1, q7, a9 - EE.VST.128.IP q1, a2, 16 - retw - - - -dl_tie728_s8_rescale_add2d_output: # >> shift - movi a13, 1 - s8i a13, a1, 0 - EE.VLDBC.8 q7, a1 # all 1 - - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - loopgtz a6, 1f - EE.SRCMB.S8.QACC q1, a7, 0 - EE.VMULAS.S8.QACC.LD.IP q0, a3, 16, q0, q7 - EE.SRCMB.S8.QACC q1, a9, 0 - EE.LDQA.S8.128.IP a4, 16 - EE.VST.128.IP q1, a2, 16 - 1: - - EE.SRCMB.S8.QACC q1, a7, 0 - EE.VMULAS.S8.QACC q0, q7 - EE.SRCMB.S8.QACC q1, a9, 0 - EE.VST.128.IP q1, a2, 16 - retw - - - - - - .align 4 - .text - .global dl_tie728_s8_add2d_11c_relu - .type dl_tie728_s8_add2d_11c_relu, @function - # .section .iram1 -dl_tie728_s8_add2d_11c_relu: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_2x_1 - # a7: c_left_x_1 - # a14: activation_alpha - # a15: activation_shift - - - l32i a6, a5, 68 - l32i a7, a5, 72 - l32i a14, a5, 52 - l32i a15, a5, 60 - - blti a6, 1, dl_tie728_s8_add2d_relu_small_channel - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.VLD.128.IP q2, a3, 16 - EE.VADDS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VRELU.S8 q4, a14, a15 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VADDS.S8.LD.INCP q1, a4, q5, q2, q3 - EE.VRELU.S8 q5, a14, a15 - EE.VST.128.IP q5, a2, 16 - 0: - - beqi a7, 1, 2f #remainder == 2*16byte - beqi a7, 2, 3f #remainder == 3*16byte - - 2: - EE.VLD.128.IP q2, a3, 16 - EE.VADDS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VRELU.S8 q4, a14, a15 - EE.VST.128.IP q4, a2, 16 - - EE.VADDS.S8 q5, q2, q3 - EE.VRELU.S8 q5, a14, a15 - EE.VST.128.IP q5, a2, 16 - retw - - - 3: - EE.VLD.128.IP q2, a3, 16 - EE.VADDS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VRELU.S8 q4, a14, a15 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VADDS.S8.LD.INCP q1, a4, q5, q2, q3 - EE.VRELU.S8 q5, a14, a15 - EE.VST.128.IP q5, a2, 16 - - EE.VADDS.S8 q4, q0, q1 - EE.VRELU.S8 q4, a14, a15 - EE.VST.128.IP q4, a2, 16 - retw - - -dl_tie728_s8_add2d_relu_small_channel: # channel < 3*16byte - - loopgtz a7, 0f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - EE.VADDS.S8 q2, q0, q1 - - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - 0: - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - EE.VADDS.S8 q2, q0, q1 - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - - retw - - - - - - .align 4 - .text - .global dl_tie728_s8_rescale_add2d_11c_relu - .type dl_tie728_s8_rescale_add2d_11c_relu, @function - # .section .iram1 -dl_tie728_s8_rescale_add2d_11c_relu: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr: >> shift or *scale) >> shift - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr: input1 >> shift + input0 * 1 - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a14: activation_alpha - # a15: activation_shift - - - l32i a6, a5, 64 - l32i a7, a5, 88 - l32i a8, a5, 96 - l32i a9, a5, 92 - l32i a14, a5, 52 - l32i a15, a5, 60 - - - beqi a8, 1, dl_tie728_s8_rescale_add2d_output_relu - -dl_tie728_s8_rescale_add2d_output_scale_relu: # *scale) >> shift - - s8i a8, a1, 0 - EE.VLDBC.8 q7, a1 # all output_scale - - loopgtz a6, 0f #dl_tie728_s8_rescale_add2d_11c_output_relu - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_add_rescale_output q0, q1, q1, q7, a9 - - EE.VRELU.S8 q1, a14, a15 - - EE.VST.128.IP q1, a2, 16 - 0: - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_add_rescale_output q0, q1, q1, q7, a9 - EE.VRELU.S8 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - - retw - - - - -dl_tie728_s8_rescale_add2d_output_relu: # >> shift - movi a13, 1 - s8i a13, a1, 0 - EE.VLDBC.8 q7, a1 # all 1 - - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - loopgtz a6, 0f - EE.SRCMB.S8.QACC q1, a7, 0 - EE.VMULAS.S8.QACC.LD.IP q0, a3, 16, q0, q7 - EE.SRCMB.S8.QACC q1, a9, 0 - EE.VRELU.S8 q1, a14, a15 - EE.LDQA.S8.128.IP a4, 16 - EE.VST.128.IP q1, a2, 16 - 0: - - EE.SRCMB.S8.QACC q1, a7, 0 - EE.VMULAS.S8.QACC q0, q7 - EE.SRCMB.S8.QACC q1, a9, 0 - - EE.VRELU.S8 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - - retw - - - - - - .align 4 - .text - .global dl_tie728_s8_add2d_11c_prelu - .type dl_tie728_s8_add2d_11c_prelu, @function - # .section .iram1 -dl_tie728_s8_add2d_11c_prelu: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_2x_1 - # a7: c_left_x_1 - # a14: activation_alpha_ptr - # a15: activation_shift - - - l32i a6, a5, 68 - l32i a7, a5, 72 - l32i a14, a5, 56 - l32i a15, a5, 60 - - blti a6, 1, dl_tie728_s8_add2d_prelu_small_channel - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.VLD.128.IP q2, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VPRELU.S8 q4, q4, q6, a15 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S8.LD.INCP q1, a4, q5, q2, q3 - EE.VPRELU.S8 q5, q5, q6, a15 - EE.VST.128.IP q5, a2, 16 - 0: - - beqi a7, 1, 2f #remainder == 2*16byte - beqi a7, 2, 3f #remainder == 3*16byte - - 2: - EE.VLD.128.IP q2, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VPRELU.S8 q4, q4, q6, a15 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S8 q5, q2, q3 - - EE.VPRELU.S8 q5, q5, q6, a15 - EE.VST.128.IP q5, a2, 16 - retw - - 3: - EE.VLD.128.IP q2, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VPRELU.S8 q4, q4, q6, a15 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S8.LD.INCP q1, a4, q5, q2, q3 - EE.VPRELU.S8 q5, q5, q6, a15 - EE.VST.128.IP q5, a2, 16 - - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S8 q4, q0, q1 - - EE.VPRELU.S8 q4, q4, q6, a15 - EE.VST.128.IP q4, a2, 16 - retw - - -dl_tie728_s8_add2d_prelu_small_channel: # channel < 3*s - - loopgtz a7, 0f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - EE.VLD.128.IP q3, a14, 16 - EE.VADDS.S8 q2, q0, q1 - - EE.VPRELU.S8 q2, q2, q3, a15 - EE.VST.128.IP q2, a2, 16 - 0: - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - EE.VLD.128.IP q3, a14, 16 - EE.VADDS.S8 q2, q0, q1 - - EE.VPRELU.S8 q2, q2, q3, a15 - EE.VST.128.IP q2, a2, 16 - - retw - - - - .align 4 - .text - .global dl_tie728_s8_rescale_add2d_11c_prelu - .type dl_tie728_s8_rescale_add2d_11c_prelu, @function - # .section .iram1 -dl_tie728_s8_rescale_add2d_11c_prelu: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr: >> shift or *scale) >> shift - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr: input1 >> shift + input0 * 1 - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a14: activation_alpha_ptr - # a15: activation_shift - - - l32i a6, a5, 64 - l32i a7, a5, 88 - l32i a8, a5, 96 - l32i a9, a5, 92 - l32i a14, a5, 56 - l32i a15, a5, 60 - - - beqi a8, 1, dl_tie728_s8_rescale_add2d_output_prelu - - -dl_tie728_s8_rescale_add2d_output_scale_prelu: # *scale) >> shift - - s8i a8, a1, 0 - EE.VLDBC.8 q7, a1 # all output_scale - - loopgtz a6, 0f - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S8.QACC q1, a7, 0 - - EE.VLD.128.IP q5, a14, 16 - dl_tie728_rescale_add_rescale_output q0, q1, q1, q7, a9 - - EE.VPRELU.S8 q1, q1, q5, a15 - - EE.VST.128.IP q1, a2, 16 - 0: - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S8.QACC q1, a7, 0 - - EE.VLD.128.IP q5, a14, 16 - dl_tie728_rescale_add_rescale_output q0, q1, q1, q7, a9 - - EE.VPRELU.S8 q1, q1, q5, a15 - EE.VST.128.IP q1, a2, 16 - - retw - - -dl_tie728_s8_rescale_add2d_output_prelu: # >> shift - movi a13, 1 - s8i a13, a1, 0 - EE.VLDBC.8 q7, a1 # all 1 - - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - loopgtz a6, 0f - EE.SRCMB.S8.QACC q1, a7, 0 - EE.VMULAS.S8.QACC.LD.IP q0, a3, 16, q0, q7 - - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S8.QACC q1, a9, 0 - EE.VPRELU.S8 q1, q1, q6, a15 - EE.LDQA.S8.128.IP a4, 16 - EE.VST.128.IP q1, a2, 16 - 0: - - EE.SRCMB.S8.QACC q1, a7, 0 - EE.VMULAS.S8.QACC q0, q7 - - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S8.QACC q1, a9, 0 - - EE.VPRELU.S8 q1, q1, q6, a15 - EE.VST.128.IP q1, a2, 16 - - retw - - - - - - -############################################################################################################################################################ -#### -#### tie728_s8_unaligned_add2d_11c series -#### -############################################################################################################################################################ - - .align 4 - .text - .global dl_tie728_s8_unaligned_add2d_11c - .type dl_tie728_s8_unaligned_add2d_11c, @function - # .section .iram1 -dl_tie728_s8_unaligned_add2d_11c: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a10: c_remainder - - l32i a6, a5, 64 - l32i a10, a5, 76 - l32i a7, a5, 88 - - bgei a7, 0, dl_tie728_s8_unaligned_rescale_add2d_11c - -# input0 exp = input1 exp = output exp - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie728_s8_unaligned_add2d_11c_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie728_s8_unaligned_add2d_11c_0 - beqi a13, 8, dl_tie728_s8_unaligned_add2d_11c_1 - - - loopgtz a6, 0f #dl_tie728_s8_unaligned_add2d_11c - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S8 q2, q2, q5 - - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S8 q2, q2, q5 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie728_s8_unaligned_add2d_11c_remainder - - #output sar = 0 - dl_tie728_s8_unaligned_add2d_11c_0: - loopgtz a6, 1f #dl_tie728_s8_unaligned_add2d_11c_loop0 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S8 q2, q2, q5 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VST.128.IP q2, a2, 16 - 1: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S8 q2, q2, q5 - EE.VST.128.IP q2, a2, 16 - j dl_tie728_s8_unaligned_add2d_11c_remainder - - # #output sar = 8 - dl_tie728_s8_unaligned_add2d_11c_1: - loopgtz a6, 2f #dl_tie728_s8_unaligned_add2d_11c_loop1 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S8 q2, q2, q5 - - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_s8_unaligned_store1 q2, a2 - 2: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S8 q2, q2, q5 - dl_tie728_s8_unaligned_store1 q2, a2 - j dl_tie728_s8_unaligned_add2d_11c_remainder - -dl_tie728_s8_unaligned_add2d_11c_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 - -dl_tie728_s8_unaligned_add2d_11c_remainder: - - beqz a10, dl_tie728_s8_unaligned_add2d_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VADDS.S8 q2, q2, q5 - - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_add2d_end: - - retw - - -## rescaled add -dl_tie728_s8_unaligned_rescale_add2d_11c: - l32i a8, a5, 96 # output_scale - l32i a9, a5, 92 # output_shift - - beqi a8, 1, dl_tie728_s8_rescale_unaligned_add2d_output_shift - - -### rescaled to output by *scale) >> shift -dl_tie728_s8_rescale_unaligned_add2d_output_scale: - - s8i a8, a1, 0 - EE.VLDBC.8 q7, a1 # all output_scale - - blti a6, 0, dl_tie728_s8_rescale_unaligned_add2d_scale_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 3f #dl_tie728_s8_rescale_unaligned_add2d_11c_scale - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_add_rescale_output q2, q1, q2, q7, a9 - - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_s8_unaligned_store0 q2, a2, a12 - 3: - - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_add_rescale_output q2, q1, q2, q7, a9 - - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie728_s8_rescale_unaligned_add2d_scale_remainder - - -dl_tie728_s8_rescale_unaligned_add2d_scale_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 #input1 sar - -dl_tie728_s8_rescale_unaligned_add2d_scale_remainder: - beqz a10, dl_tie728_s8_unaligned_rescale_add2d_output_scale_end # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_add_rescale_output q2, q1, q2, q7, a9 - - # dl_tie728_s8_unaligned_store0 q2, a2, a12 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_rescale_add2d_output_scale_end: - retw - - -### rescaled to output by right shift -dl_tie728_s8_rescale_unaligned_add2d_output_shift: - movi a13, 1 - s8i a13, a1, 0 - EE.VLDBC.8 q7, a1 # all 1 - - blti a6, 0, dl_tie728_s8_rescale_unaligned_add2d_shift_small_remainder # channel < 16 - - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 4f #dl_tie728_s8_rescale_unaligned_add2d_11c_shift - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S8.QACC q5 - - EE.SRCMB.S8.QACC q5, a7, 0 - EE.VMULAS.S8.QACC q2, q7 - EE.SRCMB.S8.QACC q5, a9, 0 - - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_s8_unaligned_store0 q5, a2, a13 - 4: - addi a3, a3, -16 - add a3, a3, a10 - - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S8.QACC q5 - - EE.SRCMB.S8.QACC q5, a7, 0 - EE.VMULAS.S8.QACC q2, q7 - EE.SRCMB.S8.QACC q5, a9, 0 - - dl_tie728_s8_unaligned_store0 q5, a2, a13 - j dl_tie728_s8_rescale_unaligned_add2d_shift_remainder - - - -dl_tie728_s8_rescale_unaligned_add2d_shift_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 #input1 sar - -dl_tie728_s8_rescale_unaligned_add2d_shift_remainder: - beqz a10, dl_tie728_s8_unaligned_rescale_add2d_output_shift_end # c remainder - - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q5, a7, 0 - EE.VMULAS.S8.QACC q2, q7 - EE.SRCMB.S8.QACC q5, a9, 0 - - # dl_tie728_s8_unaligned_store0 q5, a2, a13 - dl_tie728_s8_store_remainder q5, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_rescale_add2d_output_shift_end: - retw - - - - - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_add2d_11c_relu - .type dl_tie728_s8_unaligned_add2d_11c_relu, @function - # .section .iram1 -dl_tie728_s8_unaligned_add2d_11c_relu: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a10: c_remainder - # a14: activation_alpha - # a15: activation_shift - - l32i a6, a5, 64 - l32i a10, a5, 76 - l32i a7, a5, 88 - l32i a14, a5, 52 - l32i a15, a5, 60 - - bgei a7, 0, dl_tie728_s8_unaligned_rescale_add2d_11c_relu - -# input0 exp = input1 exp = output exp - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie728_s8_unaligned_add2d_11c_relu_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie728_s8_unaligned_add2d_11c_relu_0 - beqi a13, 8, dl_tie728_s8_unaligned_add2d_11c_relu_1 - - - loopgtz a6, 0f #dl_tie728_s8_unaligned_add2d_11c - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S8 q2, q2, q5 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S8 q2, q2, q5 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie728_s8_unaligned_add2d_11c_relu_remainder - - #output sar = 0 - dl_tie728_s8_unaligned_add2d_11c_relu_0: - loopgtz a6, 1f #dl_tie728_s8_unaligned_add2d_11c_loop0 - EE.SRC.Q.QUP q2, q0, q1 - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S8 q2, q2, q5 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - 1: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S8 q2, q2, q5 - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie728_s8_unaligned_add2d_11c_relu_remainder - - # #output sar = 8 - dl_tie728_s8_unaligned_add2d_11c_relu_1: - loopgtz a6, 2f #dl_tie728_s8_unaligned_add2d_11c_loop1 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.VADDS.S8 q2, q2, q5 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - 2: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S8 q2, q2, q5 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - j dl_tie728_s8_unaligned_add2d_11c_relu_remainder - -dl_tie728_s8_unaligned_add2d_11c_relu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 - -dl_tie728_s8_unaligned_add2d_11c_relu_remainder: - - beqz a10, dl_tie728_s8_unaligned_add2d_relu_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VADDS.S8 q2, q2, q5 - EE.VRELU.S8 q2, a14, a15 - - # dl_tie728_s8_unaligned_store0 q2, a2, a13 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_add2d_relu_end: - - retw - - -## rescaled add -dl_tie728_s8_unaligned_rescale_add2d_11c_relu: - l32i a8, a5, 96 # output_scale - l32i a9, a5, 92 # output_shift - - beqi a8, 1, dl_tie728_s8_rescale_unaligned_add2d_output_shift_relu - - -### rescaled to output by *scale) >> shift -dl_tie728_s8_rescale_unaligned_add2d_output_scale_relu: - - s8i a8, a1, 0 - EE.VLDBC.8 q7, a1 # all output_scale - - blti a6, 0, dl_tie728_s8_rescale_unaligned_add2d_scale_relu_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 3f #dl_tie728_s8_rescale_unaligned_add2d_11c_scale - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_add_rescale_output q2, q1, q2, q7, a9 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a12 - 3: - - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_add_rescale_output q2, q1, q2, q7, a9 - - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie728_s8_rescale_unaligned_add2d_scale_relu_remainder - - -dl_tie728_s8_rescale_unaligned_add2d_scale_relu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 #input1 sar - -dl_tie728_s8_rescale_unaligned_add2d_scale_relu_remainder: - beqz a10, dl_tie728_s8_unaligned_rescale_add2d_output_scale_relu_end # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_add_rescale_output q2, q1, q2, q7, a9 - - EE.VRELU.S8 q2, a14, a15 - # dl_tie728_s8_unaligned_store0 q2, a2, a12 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_rescale_add2d_output_scale_relu_end: - retw - - -### rescaled to output by right shift -dl_tie728_s8_rescale_unaligned_add2d_output_shift_relu: - movi a13, 1 - s8i a13, a1, 0 - EE.VLDBC.8 q7, a1 # all 1 - - blti a6, 0, dl_tie728_s8_rescale_unaligned_add2d_shift_relu_small_remainder # channel < 16 - - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 4f #dl_tie728_s8_rescale_unaligned_add2d_11c_shift - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S8.QACC q5 - - EE.SRCMB.S8.QACC q5, a7, 0 - EE.VMULAS.S8.QACC q2, q7 - EE.SRCMB.S8.QACC q5, a9, 0 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S8 q5, a14, a15 - dl_tie728_s8_unaligned_store0 q5, a2, a13 - 4: - addi a3, a3, -16 - add a3, a3, a10 - - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S8.QACC q5 - - EE.SRCMB.S8.QACC q5, a7, 0 - EE.VMULAS.S8.QACC q2, q7 - EE.SRCMB.S8.QACC q5, a9, 0 - EE.VRELU.S8 q5, a14, a15 - dl_tie728_s8_unaligned_store0 q5, a2, a13 - j dl_tie728_s8_rescale_unaligned_add2d_shift_relu_remainder - - - -dl_tie728_s8_rescale_unaligned_add2d_shift_relu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 #input1 sar - -dl_tie728_s8_rescale_unaligned_add2d_shift_relu_remainder: - beqz a10, dl_tie728_s8_unaligned_rescale_add2d_output_shift_relu_end # c remainder - - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q5, a7, 0 - EE.VMULAS.S8.QACC q2, q7 - EE.SRCMB.S8.QACC q5, a9, 0 - EE.VRELU.S8 q5, a14, a15 - # dl_tie728_s8_unaligned_store0 q5, a2, a13 - dl_tie728_s8_store_remainder q5, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_rescale_add2d_output_shift_relu_end: - retw - - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_add2d_11c_prelu - .type dl_tie728_s8_unaligned_add2d_11c_prelu, @function - # .section .iram1 -dl_tie728_s8_unaligned_add2d_11c_prelu: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a10: c_remainder - # a14: activation_alpha_ptr - # a15: activation_shift - - l32i a6, a5, 64 - l32i a10, a5, 76 - l32i a7, a5, 88 - l32i a14, a5, 56 - l32i a15, a5, 60 - - bgei a7, 0, dl_tie728_s8_unaligned_rescale_add2d_11c_prelu - -# input0 exp = input1 exp = output exp - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie728_s8_unaligned_add2d_11c_prelu_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie728_s8_unaligned_add2d_11c_prelu_0 - beqi a13, 8, dl_tie728_s8_unaligned_add2d_11c_prelu_1 - - - loopgtz a6, 0f #dl_tie728_s8_unaligned_add2d_11c - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S8 q2, q2, q5 - - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S8 q2, q2, q5 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie728_s8_unaligned_add2d_11c_prelu_remainder - - #output sar = 0 - dl_tie728_s8_unaligned_add2d_11c_prelu_0: - loopgtz a6, 1f #dl_tie728_s8_unaligned_add2d_11c_loop0 - EE.SRC.Q.QUP q2, q0, q1 - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VADDS.S8 q2, q2, q5 - - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S8 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 - 1: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S8 q2, q2, q5 - EE.VPRELU.S8 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie728_s8_unaligned_add2d_11c_prelu_remainder - - # #output sar = 8 - dl_tie728_s8_unaligned_add2d_11c_prelu_1: - loopgtz a6, 2f #dl_tie728_s8_unaligned_add2d_11c_loop1 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.VADDS.S8 q2, q2, q5 - - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - 2: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S8 q2, q2, q5 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - j dl_tie728_s8_unaligned_add2d_11c_prelu_remainder - -dl_tie728_s8_unaligned_add2d_11c_prelu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 - -dl_tie728_s8_unaligned_add2d_11c_prelu_remainder: - - beqz a10, dl_tie728_s8_unaligned_add2d_prelu_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VADDS.S8 q2, q2, q5 - EE.VPRELU.S8 q2, q2, q6, a15 - # dl_tie728_s8_unaligned_store0 q2, a2, a13 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_add2d_prelu_end: - - retw - - -## rescaled add -dl_tie728_s8_unaligned_rescale_add2d_11c_prelu: - l32i a8, a5, 96 # output_scale - l32i a9, a5, 92 # output_shift - - beqi a8, 1, dl_tie728_s8_rescale_unaligned_add2d_output_shift_prelu - - -### rescaled to output by *scale) >> shift -dl_tie728_s8_rescale_unaligned_add2d_output_scale_prelu: - - s8i a8, a1, 0 - EE.VLDBC.8 q7, a1 # all output_scale - - blti a6, 0, dl_tie728_s8_rescale_unaligned_add2d_scale_prelu_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 3f #dl_tie728_s8_rescale_unaligned_add2d_11c_scale - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_add_rescale_output q2, q1, q2, q7, a9 - - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a12 - 3: - - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - EE.VLD.128.IP q6, a14, 16 - dl_tie728_rescale_add_rescale_output q2, q1, q2, q7, a9 - - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie728_s8_rescale_unaligned_add2d_scale_prelu_remainder - - -dl_tie728_s8_rescale_unaligned_add2d_scale_prelu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 #input1 sar - -dl_tie728_s8_rescale_unaligned_add2d_scale_prelu_remainder: - beqz a10, dl_tie728_s8_unaligned_rescale_add2d_output_scale_prelu_end # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - EE.VLD.128.IP q6, a14, 16 - dl_tie728_rescale_add_rescale_output q2, q1, q2, q7, a9 - - EE.VPRELU.S8 q2, q2, q6, a15 - # dl_tie728_s8_unaligned_store0 q2, a2, a12 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_rescale_add2d_output_scale_prelu_end: - retw - - -### rescaled to output by right shift -dl_tie728_s8_rescale_unaligned_add2d_output_shift_prelu: - movi a13, 1 - s8i a13, a1, 0 - EE.VLDBC.8 q7, a1 # all 1 - - blti a6, 0, dl_tie728_s8_rescale_unaligned_add2d_shift_prelu_small_remainder # channel < 16 - - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 4f #dl_tie728_s8_rescale_unaligned_add2d_11c_shift - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S8.QACC q5 - - EE.SRCMB.S8.QACC q5, a7, 0 - EE.VMULAS.S8.QACC q2, q7 - EE.SRCMB.S8.QACC q5, a9, 0 - - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S8 q5, q5, q6, a15 - dl_tie728_s8_unaligned_store0 q5, a2, a13 - 4: - addi a3, a3, -16 - add a3, a3, a10 - - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S8.QACC q5 - - EE.SRCMB.S8.QACC q5, a7, 0 - EE.VMULAS.S8.QACC q2, q7 - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S8.QACC q5, a9, 0 - EE.VPRELU.S8 q5, q5, q6, a15 - dl_tie728_s8_unaligned_store0 q5, a2, a13 - j dl_tie728_s8_rescale_unaligned_add2d_shift_prelu_remainder - - - -dl_tie728_s8_rescale_unaligned_add2d_shift_prelu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 #input0 sar - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 #input1 sar - -dl_tie728_s8_rescale_unaligned_add2d_shift_prelu_remainder: - beqz a10, dl_tie728_s8_unaligned_rescale_add2d_output_shift_prelu_end # c remainder - - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q5, a7, 0 - EE.VMULAS.S8.QACC q2, q7 - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S8.QACC q5, a9, 0 - EE.VPRELU.S8 q5, q5, q6, a15 - # dl_tie728_s8_unaligned_store0 q5, a2, a13 - dl_tie728_s8_store_remainder q5, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_rescale_add2d_output_shift_prelu_end: - retw - diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_avg_pool2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_avg_pool2d.S deleted file mode 100644 index 62b97514..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_avg_pool2d.S +++ /dev/null @@ -1,551 +0,0 @@ -############################################################################################################################################################ -#### -#### dl_tie728_s8_avg_pool2d series -#### -############################################################################################################################################################ - -#include "dl_tie728_s8.S" - - - .align 4 - .text - .global dl_tie728_s8_avg_pool2d_22c1 - .type dl_tie728_s8_avg_pool2d_22c1, @function - .section .iram1 -dl_tie728_s8_avg_pool2d_22c1: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - l32i a5, a4, 16 # input_y_offset - l32i a6, a4, 20 # input_x_offset - l32i a10, a4, 4 # input_channel - l32i a11, a4, 104 # c_div_x_1 - l32i a13, a4, 56 # shift - - addi a14, a4, 64 - EE.VLDBC.8 q0, a14 # avg_pool_area_inv - - add a7, a3, a6 - add a8, a3, a5 - add a9, a8, a6 - - EE.VLD.128.IP q1, a3, 16 - EE.VLD.128.IP q2, a7, 16 - loopgtz a11, 0f - EE.ZERO.QACC - EE.VMULAS.S8.QACC.LD.IP q3, a8, 16, q0, q1 - EE.VMULAS.S8.QACC.LD.IP q4, a9, 16, q0, q2 - EE.VMULAS.S8.QACC.LD.IP q1, a3, 16, q0, q3 - EE.VMULAS.S8.QACC.LD.IP q2, a7, 16, q0, q4 - # EE.SRCMB.S8.QACC q7, a13, 0 - tie728_s8_vector_round_result q7, a13, a15, q6 - EE.VST.128.IP q7, a2, 16 - 0: - - EE.ZERO.QACC - EE.VMULAS.S8.QACC.LD.IP q3, a8, 16, q0, q1 - EE.VMULAS.S8.QACC.LD.IP q4, a9, 16, q0, q2 - EE.VMULAS.S8.QACC.LD.IP q1, a3, 16, q0, q3 - EE.VMULAS.S8.QACC.LD.IP q2, a7, 16, q0, q4 - # EE.SRCMB.S8.QACC q7, a13, 0 - tie728_s8_vector_round_result q7, a13, a15, q6 - - EE.VST.128.IP q7, a2, 16 - retw - - - - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_avg_pool2d_22c1 - .type dl_tie728_s8_unaligned_avg_pool2d_22c1, @function - .section .iram1 -dl_tie728_s8_unaligned_avg_pool2d_22c1: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - l32i a5, a4, 16 # input_y_offset - l32i a6, a4, 20 # input_x_offset - l32i a10, a4, 4 # input_channel - l32i a11, a4, 104 # c_div_x_1 - l32i a12, a4, 60 # c_remainder - l32i a13, a4, 56 # shift - - addi a14, a4, 64 - EE.VLDBC.8 q6, a14 # avg_pool_area_inv - - add a7, a3, a6 - add a8, a3, a5 - add a9, a8, a6 - - blti a11, 0, dl_tie728_s8_unaligned_avg_pool2d_22c1_remainder #channel < 16 - - EE.LD.128.USAR.IP q7, a2, 0 #get output_ptr sar_byte - rur.sar_byte a15 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q1, a3, 0 - - beqi a15, 0, 1f - beqi a15, 8, 2f - - loopgtz a11, 0f - EE.ZERO.QACC - EE.SRC.Q.LD.IP q2, a7, 16, q0, q1 - - EE.LD.128.USAR.IP q3, a7, 0 - EE.VMULAS.S8.QACC q6, q0 - EE.SRC.Q.LD.IP q4, a8, 16, q2, q3 - - EE.LD.128.USAR.IP q5, a8, 0 - EE.VMULAS.S8.QACC q6, q2 - EE.SRC.Q.LD.IP q2, a9, 16, q4, q5 - - EE.LD.128.USAR.IP q3, a9, 0 - EE.VMULAS.S8.QACC q6, q4 - EE.SRC.Q.LD.IP q0, a3, 16, q2, q3 - - EE.LD.128.USAR.IP q1, a3, 0 - EE.VMULAS.S8.QACC q6, q2 - # EE.SRCMB.S8.QACC q7, a13, 0 - tie728_s8_vector_round_result q7, a13, a15, q5 - dl_tie728_s8_unaligned_store0 q7, a2, a14 - 0: - j dl_tie728_s8_unaligned_avg_pool2d_22c1_loop_end - - -1: - loopgtz a11, 0f - EE.ZERO.QACC - EE.SRC.Q.LD.IP q2, a7, 16, q0, q1 - - EE.LD.128.USAR.IP q3, a7, 0 - EE.VMULAS.S8.QACC q6, q0 - EE.SRC.Q.LD.IP q4, a8, 16, q2, q3 - - EE.LD.128.USAR.IP q5, a8, 0 - EE.VMULAS.S8.QACC q6, q2 - EE.SRC.Q.LD.IP q2, a9, 16, q4, q5 - - EE.LD.128.USAR.IP q3, a9, 0 - EE.VMULAS.S8.QACC q6, q4 - EE.SRC.Q.LD.IP q0, a3, 16, q2, q3 - - EE.LD.128.USAR.IP q1, a3, 0 - EE.VMULAS.S8.QACC q6, q2 - # EE.SRCMB.S8.QACC q7, a13, 0 - tie728_s8_vector_round_result q7, a13, a15, q5 - EE.VST.128.IP q7, a2, 16 - 0: - j dl_tie728_s8_unaligned_avg_pool2d_22c1_loop_end - -2: - loopgtz a11, 0f - EE.ZERO.QACC - EE.SRC.Q.LD.IP q2, a7, 16, q0, q1 - - EE.LD.128.USAR.IP q3, a7, 0 - EE.VMULAS.S8.QACC q6, q0 - EE.SRC.Q.LD.IP q4, a8, 16, q2, q3 - - EE.LD.128.USAR.IP q5, a8, 0 - EE.VMULAS.S8.QACC q6, q2 - EE.SRC.Q.LD.IP q2, a9, 16, q4, q5 - - EE.LD.128.USAR.IP q3, a9, 0 - EE.VMULAS.S8.QACC q6, q4 - EE.SRC.Q.LD.IP q0, a3, 16, q2, q3 - - EE.LD.128.USAR.IP q1, a3, 0 - EE.VMULAS.S8.QACC q6, q2 - # EE.SRCMB.S8.QACC q7, a13, 0 - tie728_s8_vector_round_result q7, a13, a15, q5 - dl_tie728_s8_unaligned_store1 q7, a2 - 0: - - -dl_tie728_s8_unaligned_avg_pool2d_22c1_loop_end: - EE.ZERO.QACC - EE.SRC.Q.LD.IP q2, a7, 16, q0, q1 - - EE.LD.128.USAR.IP q3, a7, 0 - EE.VMULAS.S8.QACC q6, q0 - EE.SRC.Q.LD.IP q4, a8, 16, q2, q3 - - EE.LD.128.USAR.IP q5, a8, 0 - EE.VMULAS.S8.QACC q6, q2 - EE.SRC.Q.LD.IP q2, a9, 16, q4, q5 - - EE.LD.128.USAR.IP q3, a9, 0 - EE.VMULAS.S8.QACC q6, q4 - EE.SRC.Q q2, q2, q3 - EE.VMULAS.S8.QACC q6, q2 - # EE.SRCMB.S8.QACC q7, a13, 0 - tie728_s8_vector_round_result q7, a13, a15, q5 - dl_tie728_s8_unaligned_store0 q7, a2, a14 - - beqz a12, dl_tie728_s8_unaligned_avg_pool2d_22c1_end - -dl_tie728_s8_unaligned_avg_pool2d_22c1_remainder: - EE.LD.128.USAR.XP q0, a3, a12 - EE.VLD.128.IP q1, a3, 0 - EE.ZERO.QACC - EE.SRC.Q q0, q0, q1 - - EE.LD.128.USAR.XP q2, a7, a12 - EE.VLD.128.IP q3, a7, 0 - EE.VMULAS.S8.QACC q6, q0 - EE.SRC.Q q2, q2, q3 - - EE.LD.128.USAR.XP q4, a8, a12 - EE.VLD.128.IP q5, a8, 0 - EE.VMULAS.S8.QACC q6, q2 - EE.SRC.Q q4, q4, q5 - - EE.LD.128.USAR.XP q2, a9, a12 - EE.VLD.128.IP q3, a9, 0 - EE.VMULAS.S8.QACC q6, q4 - EE.SRC.Q q2, q2, q3 - - EE.VMULAS.S8.QACC q6, q2 - # EE.SRCMB.S8.QACC q7, a13, 0 - tie728_s8_vector_round_result q7, a13, a15, q5 - - dl_tie728_s8_store_remainder q7, a8, a9, a10, a11, a2, a12 - -dl_tie728_s8_unaligned_avg_pool2d_22c1_end: - - retw - - - - - .align 4 - .text - .global dl_tie728_s8_avg_pool2d_hwc1 - .type dl_tie728_s8_avg_pool2d_hwc1, @function - .section .iram1 -dl_tie728_s8_avg_pool2d_hwc1: - .align 4 - entry sp, 16 - - l32i a5, a4, 16 # input_y_offset - l32i a6, a4, 20 # input_x_offset - l32i a7, a4, 4 # input_channel - l32i a8, a4, 48 # filter_height - l32i a9, a4, 52 # filter_width - l32i a11, a4, 104 # c_div_x_1 - l32i a13, a4, 56 # shift - - addi a14, a4, 64 - EE.VLDBC.8 q0, a14 # avg_pool_area_inv - - srli a10, a9, 1 - addi a10, a10, -1 # filter_w / 2 - 1 - - beqi a9, 1, dl_tie728_s8_avg_pool2d_h1c1 #filter_width == 1 - blti a11, 1, dl_tie728_s8_avg_pool2d_hw_small_channel - - 5: - mov a7, a3 - mov a14, a7 - mov a15, a8 - EE.ZERO.QACC - 4: - EE.VLD.128.XP q1, a14, a6 - EE.VLD.128.XP q2, a14, a6 - loopgtz a10, 0f - EE.VMULAS.S8.QACC.LD.XP q1, a14, a6, q0, q1 - EE.VMULAS.S8.QACC.LD.XP q2, a14, a6, q0, q2 - 0: - bbci a9, 0, 2f - 1:#three left - EE.VMULAS.S8.QACC.LD.XP q1, a14, a6, q0, q1 - EE.VMULAS.S8.QACC q0, q2 - EE.VMULAS.S8.QACC q0, q1 - j 3f - - 2: # two left - EE.VMULAS.S8.QACC q0, q1 - EE.VMULAS.S8.QACC q0, q2 - 3: - addi a15, a15, -1 - add a7, a7, a5 - mov a14, a7 - bnez a15, 4b - - # EE.SRCMB.S8.QACC q7, a13, 0 - tie728_s8_vector_round_result q7, a13, a15, q5 - EE.VST.128.IP q7, a2, 16 - addi a3, a3, 16 - addi a11, a11, -1 - bnez a11, 5b - -dl_tie728_s8_avg_pool2d_hw_small_channel: - mov a7, a3 - mov a14, a7 - mov a15, a8 - EE.ZERO.QACC - 4: - EE.VLD.128.XP q1, a14, a6 - EE.VLD.128.XP q2, a14, a6 - loopgtz a10, 0f - EE.VMULAS.S8.QACC.LD.XP q1, a14, a6, q0, q1 - EE.VMULAS.S8.QACC.LD.XP q2, a14, a6, q0, q2 - 0: - bbci a9, 0, 2f - 1:#three left - EE.VMULAS.S8.QACC.LD.XP q1, a14, a6, q0, q1 - EE.VMULAS.S8.QACC q0, q2 - EE.VMULAS.S8.QACC q0, q1 - j 3f - - 2: # two left - EE.VMULAS.S8.QACC q0, q1 - EE.VMULAS.S8.QACC q0, q2 - 3: - addi a15, a15, -1 - add a7, a7, a5 - mov a14, a7 - bnez a15, 4b - # EE.SRCMB.S8.QACC q7, a13, 0 - tie728_s8_vector_round_result q7, a13, a15, q5 - - EE.VST.128.IP q7, a2, 16 - retw - -dl_tie728_s8_avg_pool2d_h1c1: - addi a8, a8, -1 - blti a11, 1, dl_tie728_s8_max_pool2d_h1_small_channel - 1: - mov a14, a3 - EE.ZERO.QACC - EE.VLD.128.XP q1, a14, a5 - loopgtz a8, 0f - EE.VMULAS.S8.QACC.LD.XP q1, a14, a5, q0, q1 - 0: - EE.VMULAS.S8.QACC q0, q1 - # EE.SRCMB.S8.QACC q7, a13, 0 - tie728_s8_vector_round_result q7, a13, a15, q5 - EE.VST.128.IP q7, a2, 16 - addi a3, a3, 16 - addi a11, a11, -1 - bnez a11, 1b - -dl_tie728_s8_max_pool2d_h1_small_channel: - mov a14, a3 - EE.ZERO.QACC - EE.VLD.128.XP q1, a14, a5 - loopgtz a8, 0f - EE.VMULAS.S8.QACC.LD.XP q1, a14, a5, q0, q1 - 0: - EE.VMULAS.S8.QACC q0, q1 - # EE.SRCMB.S8.QACC q7, a13, 0 - tie728_s8_vector_round_result q7, a13, a15, q5 - - EE.VST.128.IP q7, a2, 16 - retw - - - - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_avg_pool2d_hwc1 - .type dl_tie728_s8_unaligned_avg_pool2d_hwc1, @function - .section .iram1 -dl_tie728_s8_unaligned_avg_pool2d_hwc1: - .align 4 - entry sp, 16 - - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - l32i a5, a4, 16 # input_y_offset - l32i a6, a4, 20 # input_x_offset - l32i a7, a4, 4 # input_channel - l32i a8, a4, 48 # filter_height - l32i a9, a4, 52 # filter_width - l32i a11, a4, 104 # c_div_x_1 - l32i a12, a4, 60 # c_remainder - l32i a13, a4, 56 # shift - - addi a14, a4, 64 - EE.VLDBC.8 q6, a14 # avg_pool_area_inv - - srli a10, a9, 1 - addi a10, a10, -1 # filter_w / 2 - 1 - - addi a6, a6, -16 - - EE.LD.128.USAR.IP q7, a2, 0 #get output_ptr sar_byte - rur.sar_byte a15 - - addi a11, a11, 1 - - beqi a9, 1, dl_tie728_s8_unaligned_avg_pool2d_h1c1 #filter_width == 1 - blti a11, 1, dl_tie728_s8_unaligned_avg_pool2d_hw_small_channel - - - 9: - mov a7, a3 - mov a14, a7 - mov a12, a8 - EE.ZERO.QACC - 4: - EE.LD.128.USAR.IP q0, a14, 16 - EE.LD.128.USAR.XP q1, a14, a6 - loopgtz a10, 0f - EE.SRC.Q.LD.IP q2, a14, 16, q0, q1 - EE.LD.128.USAR.XP q1, a14, a6 - EE.VMULAS.S8.QACC q6, q0 - - EE.SRC.Q.LD.IP q0, a14, 16, q2, q1 - EE.LD.128.USAR.XP q1, a14, a6 - EE.VMULAS.S8.QACC q6, q2 - 0: - - bbci a9, 0, 2f - 1:#three left - EE.SRC.Q.LD.IP q2, a14, 16, q0, q1 - EE.LD.128.USAR.XP q1, a14, a6 - EE.VMULAS.S8.QACC q6, q0 - - EE.SRC.Q.LD.IP q0, a14, 16, q2, q1 - EE.LD.128.USAR.XP q1, a14, a6 - EE.VMULAS.S8.QACC q6, q2 - - EE.SRC.Q q0, q0, q1 - EE.VMULAS.S8.QACC q6, q0 - - j 3f - - 2:# two left - EE.SRC.Q.LD.IP q2, a14, 16, q0, q1 - EE.LD.128.USAR.XP q1, a14, a6 - EE.VMULAS.S8.QACC q6, q0 - - EE.SRC.Q q2, q2, q1 - EE.VMULAS.S8.QACC q6, q2 - - 3: - addi a12, a12, -1 - add a7, a7, a5 - mov a14, a7 - bnez a12, 4b - - # EE.SRCMB.S8.QACC q7, a13, 0 - tie728_s8_vector_round_result q7, a13, a14, q5 - - beqi a15, 0, 5f - beqi a15, 8, 6f - - dl_tie728_s8_unaligned_store0 q7, a2, a14 - j 7f - - 5: - EE.VST.128.IP q7, a2, 16 - j 7f - 6: - dl_tie728_s8_unaligned_store1 q7, a2 - - 7: - addi a3, a3, 16 - addi a11, a11, -1 - bnez a11, 9b - -dl_tie728_s8_unaligned_avg_pool2d_hw_small_channel: - l32i a12, a4, 60 # c_remainder - beqz a12, dl_tie728_s8_unaligned_avg_pool2d_hw_small_channel_end - - mov a7, a3 - mov a14, a7 - mov a15, a8 - addi a6, a6, 16 - sub a6, a6, a12 - - EE.ZERO.QACC - 1: - loopgtz a9, 0f - EE.LD.128.USAR.XP q0, a14, a12 - EE.VLD.128.XP q1, a14, a6 - EE.SRC.Q q0, q0, q1 - EE.VMULAS.S8.QACC q6, q0 - 0: - addi a15, a15, -1 - add a7, a7, a5 - mov a14, a7 - bnez a15, 1b - - # EE.SRCMB.S8.QACC q7, a13, 0 - tie728_s8_vector_round_result q7, a13, a14, q5 - dl_tie728_s8_store_remainder q7, a8, a9, a10, a11, a2, a12 - -dl_tie728_s8_unaligned_avg_pool2d_hw_small_channel_end: - retw - -dl_tie728_s8_unaligned_avg_pool2d_h1c1: - addi a5, a5, -16 - blti a11, 1, dl_tie728_s8_unaligned_avg_pool2d_h1_remainder - - 5: - mov a14, a3 - EE.ZERO.QACC - loopgtz a8, 0f - EE.LD.128.USAR.IP q0, a14, 16 - EE.VLD.128.XP q1, a14, a5 - EE.SRC.Q q0, q0, q1 - EE.VMULAS.S8.QACC q6, q0 - 0: - - # EE.SRCMB.S8.QACC q7, a13, 0 - tie728_s8_vector_round_result q7, a13, a14, q5 - beqi a15, 0, 1f - beqi a15, 8, 2f - - dl_tie728_s8_unaligned_store0 q7, a2, a9 - j 3f - 1: - EE.VST.128.IP q7, a2, 16 - j 3f - 2: - dl_tie728_s8_unaligned_store1 q7, a2 - - 3: - addi a3, a3, 16 - addi a11, a11, -1 - bnez a11, 5b - -dl_tie728_s8_unaligned_avg_pool2d_h1_remainder: - beqz a12, dl_tie728_s8_unaligned_avg_pool2d_hwc1_end - - mov a14, a3 - addi a5, a5, 16 - sub a5, a5, a12 - EE.ZERO.QACC - loopgtz a8, 0f - EE.LD.128.USAR.XP q0, a14, a12 - EE.VLD.128.XP q1, a14, a5 - EE.SRC.Q q0, q0, q1 - EE.VMULAS.S8.QACC q6, q0 - 0: - - # EE.SRCMB.S8.QACC q7, a13, 0 - tie728_s8_vector_round_result q7, a13, a14, q5 - dl_tie728_s8_store_remainder q7, a8, a9, a10, a11, a2, a12 - -dl_tie728_s8_unaligned_avg_pool2d_hwc1_end: - retw \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_conv2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_conv2d.S deleted file mode 100644 index d54fed79..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_conv2d.S +++ /dev/null @@ -1,2598 +0,0 @@ -#include "dl_tie728_s8.S" - -############################################################################################################################################################ -#### -#### tie728_s8_conv2d_11cn series -#### -############################################################################################################################################################ -.macro tie728_s8_conv2d_11c16 input_v0 input_ptr filter_v0 filter_v1 filter_ptr c_div_x_1 - # scalar * vecter and accumulate into QACC - # input_ptr += (c_div_x_1 + 1) * 16 in the end - # filter_ptr point to the next 16 bytes in the end - - # input_v0: 16 input elements - # filter_v0: 16 filter elements - # filter_v1: 16 filter elements - # input_ptr: input_ptr - # filter_ptr: filter_ptr - # c_div_x_1: input_channel // 16 - 1 - - EE.VLD.128.IP \input_v0, \input_ptr, 16 - EE.VLD.128.IP \filter_v0, \filter_ptr, 16 - EE.VLD.128.IP \filter_v1, \filter_ptr, 16 - - loopgtz \c_div_x_1, 0f - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 9 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 10 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 11 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 12 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 13 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 14 - EE.VSMULAS.S8.QACC.LD.INCP \input_v0, \input_ptr, \filter_v1, \input_v0, 15 - EE.VLD.128.IP \filter_v1, \filter_ptr, 16 -0: - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 9 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 10 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 11 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 12 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 13 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 14 - EE.VSMULAS.S8.QACC \filter_v1, \input_v0, 15 -.endm - - -############################################################################################################################################################ -#### -#### tie728_s8_conv2d_11cn -#### -############################################################################################################################################################ -.macro tie728_s8_conv2d_11cn_load_args args filter_ptr c_div_x_1 n_rs3 - l32i \n_rs3, \args, 96 // output_channel_div_8 - l32i \filter_ptr, \args, 48 // filter - l32i \c_div_x_1, \args, 100 // input_channel / x - 1 -.endm - - - - .align 4 - .text - .global dl_tie728_s8_conv2d_11cn - .type dl_tie728_s8_conv2d_11cn, @function - # .section .iram1 -dl_tie728_s8_conv2d_11cn: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: - # a10: - # a11: bias_ptr - # a12: - # a13: - # a14: - # a15: moving_input_ptr - tie728_s8_conv2d_11cn_load_args a4, a5, a6, a7 - - l32i a11, a4, 68 # bias - l32i a8, a4, 64 # mac shift - blti a8, 0, dl_tie728_s8_conv2d_per_channel_11cn - - -dl_tie728_s8_conv2d_per_layer_11cn: - - beqz a11, tie728_s8_conv2d_per_layer_11cn_no_bias_loop - - tie728_s8_conv2d_per_layer_11cn_bias_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_conv2d_11c16 q0, a15, q1, q2, a5, a6 - tie728_s8_vector_round_result q0, a8, a15, q3 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_11cn_bias_loop - retw - - - tie728_s8_conv2d_per_layer_11cn_no_bias_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_11c16 q0, a15, q1, q2, a5, a6 - tie728_s8_vector_round_result q0, a8, a15, q3 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_11cn_no_bias_loop - retw - - -dl_tie728_s8_conv2d_per_channel_11cn: - l32i a8, a4, 104 # filter_channel_factor address - - beqz a11, tie728_s8_conv2d_per_channel_11cn_no_bias_loop - - tie728_s8_conv2d_per_channel_11cn_bias_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_11c16 q0, a15, q1, q2, a5, a6 - # tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - # tie728_s8_conv2d_bias q0, q1, a11 - tie728_s8_conv2d_per_channel_with_bias_result q0, q1, a8, a11, a15, q2 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_11cn_bias_loop - retw - - - tie728_s8_conv2d_per_channel_11cn_no_bias_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_11c16 q0, a15, q1, q2, a5, a6 - tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_11cn_no_bias_loop - retw - - - - .align 4 - .text - .global dl_tie728_s8_conv2d_11cn_relu - .type dl_tie728_s8_conv2d_11cn_relu, @function - # .section .iram1 -dl_tie728_s8_conv2d_11cn_relu: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: - # a10: - # a11: bias_ptr - # a12: activation_alpha - # a13: activation_shift - # a14: - # a15: moving_input_ptr - tie728_s8_conv2d_11cn_load_args a4, a5, a6, a7 - - l32i a12, a4, 76 # activation_alpha - l32i a13, a4, 84 # activation_shift - - l32i a11, a4, 68 # bias - l32i a8, a4, 64 # mac shift - blti a8, 0, dl_tie728_s8_conv2d_per_channel_11cn_relu - - -dl_tie728_s8_conv2d_per_layer_11cn_relu: - - beqz a11, tie728_s8_conv2d_per_layer_11cn_no_bias_relu_loop - - tie728_s8_conv2d_per_layer_11cn_bias_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_conv2d_11c16 q0, a15, q1, q2, a5, a6 - tie728_s8_vector_round_result q0, a8, a15, q3 - tie728_s8_conv2d_relu q0, a12, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_11cn_bias_relu_loop - retw - - - tie728_s8_conv2d_per_layer_11cn_no_bias_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_11c16 q0, a15, q1, q2, a5, a6 - tie728_s8_vector_round_result q0, a8, a15, q3 - tie728_s8_conv2d_relu q0, a12, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_11cn_no_bias_relu_loop - retw - - -dl_tie728_s8_conv2d_per_channel_11cn_relu: - l32i a8, a4, 104 # filter_channel_factor address - - beqz a11, tie728_s8_conv2d_per_channel_11cn_no_bias_relu_loop - - tie728_s8_conv2d_per_channel_11cn_bias_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_11c16 q0, a15, q1, q2, a5, a6 - # tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - # tie728_s8_conv2d_bias_relu q0, q1, a11, a12, a13 - tie728_s8_conv2d_per_channel_with_bias_result q0, q1, a8, a11, a15, q2 - tie728_s8_conv2d_relu q0, a12, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_11cn_bias_relu_loop - retw - - - tie728_s8_conv2d_per_channel_11cn_no_bias_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_11c16 q0, a15, q1, q2, a5, a6 - tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - tie728_s8_conv2d_relu q0, a12, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_11cn_no_bias_relu_loop - retw - - - - .align 4 - .text - .global dl_tie728_s8_conv2d_11cn_prelu - .type dl_tie728_s8_conv2d_11cn_prelu, @function - # .section .iram1 -dl_tie728_s8_conv2d_11cn_prelu: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: - # a10: - # a11: bias_ptr - # a12: activation_alpha - # a13: activation_shift - # a14: - # a15: moving_input_ptr - tie728_s8_conv2d_11cn_load_args a4, a5, a6, a7 - - l32i a12, a4, 80 # activation_alpha_ptr - l32i a13, a4, 84 # activation_shift - - l32i a11, a4, 68 # bias - l32i a8, a4, 64 # mac shift - blti a8, 0, dl_tie728_s8_conv2d_per_channel_11cn_prelu - - -dl_tie728_s8_conv2d_per_layer_11cn_prelu: - - beqz a11, tie728_s8_conv2d_per_layer_11cn_no_bias_prelu_loop - - tie728_s8_conv2d_per_layer_11cn_bias_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_conv2d_11c16 q0, a15, q1, q2, a5, a6 - tie728_s8_vector_round_result q0, a8, a15, q3 - tie728_s8_conv2d_prelu q0, q2, a12, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_11cn_bias_prelu_loop - retw - - - tie728_s8_conv2d_per_layer_11cn_no_bias_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_11c16 q0, a15, q1, q2, a5, a6 - tie728_s8_vector_round_result q0, a8, a15, q3 - tie728_s8_conv2d_prelu q0, q2, a12, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_11cn_no_bias_prelu_loop - retw - - -dl_tie728_s8_conv2d_per_channel_11cn_prelu: - l32i a8, a4, 104 # filter_channel_factor address - - beqz a11, tie728_s8_conv2d_per_channel_11cn_no_bias_prelu_loop - - tie728_s8_conv2d_per_channel_11cn_bias_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_11c16 q0, a15, q1, q2, a5, a6 - # tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - # tie728_s8_conv2d_bias_prelu q0, q1, a11, q2, a12, a13 - tie728_s8_conv2d_per_channel_with_bias_result q0, q1, a8, a11, a15, q2 - tie728_s8_conv2d_prelu q0, q2, a12, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_11cn_bias_prelu_loop - retw - - - tie728_s8_conv2d_per_channel_11cn_no_bias_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_11c16 q0, a15, q1, q2, a5, a6 - tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - tie728_s8_conv2d_prelu q0, q2, a12, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_11cn_no_bias_prelu_loop - retw - - - - - - - -############################################################################################################################################################ -#### -#### tie728_s8_unaligned_conv2d_11cn -#### -############################################################################################################################################################ - -.macro tie728_s8_conv2d_1_1_n_remainder_result operation_type output output_ptr mac_shift bias_ptr activation_alpha activation_shift tmp tmp_q1 - #l16si a7, a1, 0 - EE.MOVI.32.A \operation_type, \tmp, 0 # operation type in q7[0] - - bbci \tmp, 3, 7f - bbci \tmp, 2, 11f -11: # per_channel bias + prelu, 0x1011 - bbci \tmp, 1, 9f - bbci \tmp, 0, 10f - - # l16si \tmp, \mac_shift, 0 - # addi \mac_shift, \mac_shift, 2 - # EE.SRS.ACCX \output, \tmp, 0 - - # # add bias - # l8ui \tmp, \bias_ptr, 0 # load bias - # addi \bias_ptr, \bias_ptr, 1 - # sext \tmp, \tmp, 7 - # add \output, \output, \tmp - - movi \tmp, 4 - EE.SRS.ACCX \output, \tmp, 0 - - l16si \tmp, \bias_ptr, 0 # load bias - addi \bias_ptr, \bias_ptr, 2 - - add \output, \output, \tmp # add bias - - l16si \tmp, \mac_shift, 0 - addi \mac_shift, \mac_shift, 2 - addi \tmp, \tmp, -4 - - ssr \tmp # mac_shift-4 - sra \output, \output - - # prelu - bgez \output, 17f - - l8ui \tmp, \activation_alpha, 0 # load PReLU alpha - sext \tmp, \tmp, 7 - mull \output, \output, \tmp - ssr \activation_shift # activation_shift - sra \output, \output - - j 17f # jump to 17f -10: # per_channel bias + relu, 0x1010 - # l16si \tmp, \mac_shift, 0 - # addi \mac_shift, \mac_shift, 2 - # EE.SRS.ACCX \output, \tmp, 0 - - # # add bias - # l8ui \tmp, \bias_ptr, 0 # load bias - # addi \bias_ptr, \bias_ptr, 1 - # sext \tmp, \tmp, 7 - # add \output, \output, \tmp - - movi \tmp, 4 - EE.SRS.ACCX \output, \tmp, 0 - - l16si \tmp, \bias_ptr, 0 # load bias - addi \bias_ptr, \bias_ptr, 2 - - add \output, \output, \tmp # add bias - - l16si \tmp, \mac_shift, 0 - addi \mac_shift, \mac_shift, 2 - addi \tmp, \tmp, -4 - - ssr \tmp # mac_shift-4 - sra \output, \output - - # Relu or LeakyRelu - bgez \output, 16f - mull \output, \output, \activation_alpha - ssr \activation_shift # activation_shift - sra \output, \output - - j 16f # jump to 16f - -9: # per_channel bias, 0x1001 - bbci \tmp, 0, 8f - # l16si \tmp, \mac_shift, 0 - # addi \mac_shift, \mac_shift, 2 - # EE.SRS.ACCX \output, \tmp, 0 - - # # add bias - # l8ui \tmp, \bias_ptr, 0 # load bias - # addi \bias_ptr, \bias_ptr, 1 - # sext \tmp, \tmp, 7 - # add \output, \output, \tmp - movi \tmp, 4 - EE.SRS.ACCX \output, \tmp, 0 - - l16si \tmp, \bias_ptr, 0 # load bias - addi \bias_ptr, \bias_ptr, 2 - - add \output, \output, \tmp # add bias - - l16si \tmp, \mac_shift, 0 - addi \mac_shift, \mac_shift, 2 - addi \tmp, \tmp, -4 - - ssr \tmp # mac_shift-4 - sra \output, \output - - j 16f # jump to 16f -8: # per_channel no_bias + prelu, 0x1000 - l16si \tmp, \mac_shift, 0 - addi \mac_shift, \mac_shift, 2 - EE.SRS.ACCX \output, \tmp, 0 - - # prelu - bgez \output, 17f - - l8ui \tmp, \activation_alpha, 0 # load PReLU alpha - sext \tmp, \tmp, 7 - mull \output, \output, \tmp - ssr \activation_shift # activation_shift - sra \output, \output - - j 17f # jump to 16f - -7: # per_channel no_bias + relu, 0x111 - bbci \tmp, 2, 3f - bbci \tmp, 1, 5f - bbci \tmp, 0, 6f - - l16si \tmp, \mac_shift, 0 - - addi \mac_shift, \mac_shift, 2 - EE.SRS.ACCX \output, \tmp, 0 - - # Relu or LeakyRelu - bgez \output, 16f - mull \output, \output, \activation_alpha - ssr \activation_shift # activation_shift - sra \output, \output - - j 16f # jump to 16f - -6: # per_channel no_bias, 0x110 - l16si \tmp, \mac_shift, 0 - addi \mac_shift, \mac_shift, 2 - EE.SRS.ACCX \output, \tmp, 0 - - - j 16f # jump to 16f - -5: # remainder == 4, 5 - bbci \tmp, 0, 4f - # per_layer bias + prelu, 0x101 - # EE.SRS.ACCX \output, \mac_shift, 0 - tie728_s8_element_round_result \output, \mac_shift, \tmp, \tmp_q1 - - # bias will be preload - # # add bias - # l8ui \tmp, \bias_ptr, 0 # load bias - # addi \bias_ptr, \bias_ptr, 1 - # sext \tmp, \tmp, 7 - # add \output, \output, \tmp - - # prelu - bgez \output, 17f - - l8ui \tmp, \activation_alpha, 0 # load PReLU alpha - sext \tmp, \tmp, 7 - mull \output, \output, \tmp - ssr \activation_shift # activation_shift - sra \output, \output - - j 17f # jump to 17f - -4: # per_layer bias + relu, 0x100 - # EE.SRS.ACCX \output, \mac_shift, 0 - tie728_s8_element_round_result \output, \mac_shift, \tmp, \tmp_q1 - - # bias will be preload - # # add bias - # l8ui \tmp, \bias_ptr, 0 # load bias - # addi \bias_ptr, \bias_ptr, 1 - # sext \tmp, \tmp, 7 - # add \output, \output, \tmp - - # Relu or LeakyRelu - bgez \output, 16f - mull \output, \output, \activation_alpha - ssr \activation_shift # activation_shift - sra \output, \output - - - j 16f # jump to 16f - -3: # remainder == 1, 2, 3 - bbci \tmp, 1, 1f - bbci \tmp, 0, 2f - # per_layer bias, 0x011 - # EE.SRS.ACCX \output, \mac_shift, 0 - tie728_s8_element_round_result \output, \mac_shift, \tmp, \tmp_q1 - - # bias will be preload - # # add bias - # l8ui \tmp, \bias_ptr, 0 # load bias - # addi \bias_ptr, \bias_ptr, 1 - # sext \tmp, \tmp, 7 - # add \output, \output, \tmp - - j 16f # jump to 16f - -2: # per_layer no_bias + prelu, 0x010 - # EE.SRS.ACCX \output, \mac_shift, 0 - tie728_s8_element_round_result \output, \mac_shift, \tmp, \tmp_q1 - - # prelu - bgez \output, 17f - - l8ui \tmp, \activation_alpha, 0 # load PReLU alpha - sext \tmp, \tmp, 7 - mull \output, \output, \tmp - ssr \activation_shift # activation_shift - sra \output, \output - - j 17f # jump to 17f - -1: # per_layer no_bias + relu, 0x001 - bbci \tmp, 0, 0f - - # EE.SRS.ACCX \output, \mac_shift, 0 - tie728_s8_element_round_result \output, \mac_shift, \tmp, \tmp_q1 - - # Relu or LeakyRelu - bgez \output, 16f - mull \output, \output, \activation_alpha - ssr \activation_shift # activation_shift - sra \output, \output - - j 16f # jump to 16f - -0: # per_layer no_bias - # EE.SRS.ACCX \output, \mac_shift, 0 - tie728_s8_element_round_result \output, \mac_shift, \tmp, \tmp_q1 - j 16f # jump to 16f - -17: # update prelu ptr - addi \activation_alpha, \activation_alpha, 1 -16: - clamps \output, \output, 7 - s8i \output, \output_ptr, 0 - addi \output_ptr, \output_ptr, 1 -.endm - - - -.macro tie728_s8_conv2d_1_1_unaligned_c input_v0 input_front_aligned input_back_aligned input_ptr filter_v0 filter_v1 filter_ptr c_div_x_1 remainder_c input_sar - # scalar * vecter and accumulate into QACC - # input_ptr += (c_div_x_1 + 1) * 16 in the end - # filter_ptr point to the next 16 bytes in the end - - # input_v0: 16 input elements - # filter_v0: 16 filter elements - # filter_v1: 16 filter elements - # input_ptr: input_ptr - # filter_ptr: filter_ptr - # c_div_x_1: input_channel // 16 - 1 - - blti \c_div_x_1, 0, 17f # input_channel < 16 - - EE.LD.128.USAR.IP \input_front_aligned, \input_ptr, 16 - - - EE.VLD.128.IP \filter_v0, \filter_ptr, 16 - EE.VLD.128.IP \filter_v1, \filter_ptr, 16 - EE.LD.128.USAR.IP \input_back_aligned, \input_ptr, 16 - - loopgtz \c_div_x_1, 0f - # EE.LD.128.USAR.IP \input_back_aligned, \input_ptr, 16 - EE.SRC.Q.QUP \input_v0, \input_front_aligned, \input_back_aligned - EE.LD.128.USAR.IP \input_back_aligned, \input_ptr, 16 - - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 9 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 10 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 11 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 12 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 13 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 14 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 15 - - 0: - - - # EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \remainder_c - EE.SRC.Q.QUP \input_v0, \input_front_aligned, \input_back_aligned - addi \input_ptr, \input_ptr, -16 - add \input_ptr, \input_ptr, \remainder_c #input_ptr and the end of c - - - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 9 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 10 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 11 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 12 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 13 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 14 - EE.VSMULAS.S8.QACC \filter_v1, \input_v0, 15 - - - beqz \remainder_c, 16f #no c_remainder - - EE.VLD.128.IP \filter_v0, \filter_ptr, 16 - - # rur.sar_byte \input_sar #input sar - # EE.LD.128.USAR.IP \input_back_aligned, \input_ptr, 0 - # wur.sar_byte \input_sar #input sar - EE.VLD.128.IP \input_back_aligned, \input_ptr, 0 - EE.SRC.Q \input_v0, \input_front_aligned, \input_back_aligned - j 15f - - -17: # input_channel < 16 - EE.VLD.128.IP \filter_v0, \filter_ptr, 16 - - EE.LD.128.USAR.XP \input_front_aligned, \input_ptr, \remainder_c - # rur.sar_byte \input_sar #input sar - # EE.LD.128.USAR.IP \input_back_aligned, \input_ptr, 0 - # wur.sar_byte \input_sar #input sar - EE.VLD.128.IP \input_back_aligned, \input_ptr, 0 - EE.SRC.Q \input_v0, \input_front_aligned, \input_back_aligned - - -15: # remainder_c == 15, 0x1111 - bbci \remainder_c, 3, 7f - - EE.VLD.128.IP \filter_v1, \filter_ptr, 16 - - bbci \remainder_c, 2, 11f - bbci \remainder_c, 1, 13f - bbci \remainder_c, 0, 14f - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 9 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 10 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 11 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 12 - EE.VSMULAS.S8.QACC \filter_v1, \input_v0, 13 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 14 - j 16f # jump to 16f - -14: # remainder_c == 14, 0x1110 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 9 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 10 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 11 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 12 - EE.VSMULAS.S8.QACC \filter_v1, \input_v0, 13 - j 16f # jump to 16f - -13: # remainder_c == 13, 0x1101 - bbci \remainder_c, 0, 12f - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 9 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 10 - EE.VSMULAS.S8.QACC \filter_v1, \input_v0, 11 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 12 - j 16f # jump to 16f - -12: # remainder_c == 12, 0x1100 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 9 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 10 - EE.VSMULAS.S8.QACC \filter_v1, \input_v0, 11 - j 16f # jump to 16f - -11: # remainder_c == 11, 0x1011 - bbci \remainder_c, 1, 9f - bbci \remainder_c, 0, 10f - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 8 - EE.VSMULAS.S8.QACC \filter_v1, \input_v0, 9 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 10 - j 16f # jump to 16f -10: # remainder_c == 10, 0x1010 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 7 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 8 - EE.VSMULAS.S8.QACC \filter_v1, \input_v0, 9 - j 16f # jump to 16f -9: # remainder_c == 9, 0x1001 - bbci \remainder_c, 0, 8f - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 6 - EE.VSMULAS.S8.QACC \filter_v1, \input_v0, 7 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 8 - j 16f # jump to 16f -8: # remainder_c == 8, 0x1000 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 5 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 6 - EE.VSMULAS.S8.QACC \filter_v1, \input_v0, 7 - j 16f # jump to 16f - -7: # remainder == 7, 0x111 - bbci \remainder_c, 2, 3f - - EE.VLD.128.IP \filter_v1, \filter_ptr, 16 - bbci \remainder_c, 1, 5f - bbci \remainder_c, 0, 6f - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 4 - EE.VSMULAS.S8.QACC \filter_v1, \input_v0, 5 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 6 - j 16f # jump to 16f - -6: # remainder == 6, 0x110 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 3 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 4 - EE.VSMULAS.S8.QACC \filter_v1, \input_v0, 5 - j 16f # jump to 16f - -5: # remainder == 4, 5 - bbci \remainder_c, 0, 4f - # remainder == 5, 0x101 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 2 - EE.VSMULAS.S8.QACC \filter_v1, \input_v0, 3 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 4 - j 16f # jump to 16f - -4: # remainder == 4, 0x100 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v1, \filter_ptr, \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 2 - EE.VSMULAS.S8.QACC \filter_v1, \input_v0, 3 - j 16f # jump to 16f - -3: # remainder == 1, 2, 3 - bbci \remainder_c, 1, 1f - - EE.VLD.128.IP \filter_v1, \filter_ptr, 16 - - bbci \remainder_c, 0, 2f - # remainder == 3, 0x011 - EE.VSMULAS.S8.QACC.LD.INCP \filter_v0, \filter_ptr, \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC \filter_v1, \input_v0, 1 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 2 - j 16f # jump to 16f - -2: # remainder == 2, 0x010 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 0 - EE.VSMULAS.S8.QACC \filter_v1, \input_v0, 1 - j 16f # jump to 16f - -1: # remainder == 1, 0x001 - EE.VSMULAS.S8.QACC \filter_v0, \input_v0, 0 - -16: -.endm - - - -.macro tie728_s8_conv2d_1_1_c_n_remainder input_v0 input_front_aligned input_back_aligned input_ptr filter_v0 filter_front_aligned filter_back_aligned filter_ptr c_div_x_1 remainder_c input_sar filter_sar - # scalar * vecter and accumulate into QACC - # input_ptr += (c_div_x_1 + 1) * 16 in the end - # filter_ptr point to the next 16 bytes in the end - - # input_v0: 16 input elements - # filter_v0: 16 filter elements - # filter_v1: 16 filter elements - # input_ptr: input_ptr - # filter_ptr: filter_ptr - # c_div_x_1: input_channel // 16 - 1 - - blti \c_div_x_1, 0, 17f # input_channel < 16 - - EE.LD.128.USAR.IP \input_front_aligned, \input_ptr, 16 - EE.LD.128.USAR.IP \filter_front_aligned, \filter_ptr, 16 - EE.LD.128.USAR.IP \input_back_aligned, \input_ptr, 16 - - loopgtz \c_div_x_1, 0f - EE.SRC.Q.QUP \input_v0, \input_front_aligned, \input_back_aligned - - EE.LD.128.USAR.IP \filter_back_aligned, \filter_ptr, 16 - EE.SRC.Q.QUP \filter_v0, \filter_front_aligned, \filter_back_aligned - - EE.LD.128.USAR.IP \input_back_aligned, \input_ptr, 16 - EE.VMULAS.S8.ACCX \filter_v0, \input_v0 - 0: - - - # EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \remainder_c - EE.SRC.Q.QUP \input_v0, \input_front_aligned, \input_back_aligned - rur.sar_byte \input_sar #input sar - addi \input_ptr, \input_ptr, -16 - - EE.LD.128.USAR.XP \filter_back_aligned, \filter_ptr, \remainder_c - add \input_ptr, \input_ptr, \remainder_c #input_ptr and the end of c - EE.SRC.Q.QUP \filter_v0, \filter_front_aligned, \filter_back_aligned - - EE.VMULAS.S8.ACCX \filter_v0, \input_v0 - - beqz \remainder_c, 16f #no remainder_c - - - # filter remainder is in one 128bit then filter_back_aligned = filter_front_aligned - # rur.sar_byte \filter_sar #filter sar - # EE.LD.128.USAR.IP \filter_back_aligned, \filter_ptr, 0 - # wur.sar_byte \filter_sar #filter sar - EE.VLD.128.IP \filter_back_aligned, \filter_ptr, 0 - EE.SRC.Q \filter_v0, \filter_front_aligned, \filter_back_aligned - - - # input remainder is in one 128bit then input_back_aligned = input_front_aligned - EE.LD.128.USAR.IP \input_back_aligned, \input_ptr, 0 - wur.sar_byte \input_sar #input sar - EE.SRC.Q \input_v0, \input_front_aligned, \input_back_aligned #right shift delete the low sar_byte - - - #left shift to make the rest part(15 - remainder_c) 0 EE.SLXCCP left shift a0+1 - movi \filter_sar, 15 - sub \filter_sar, \filter_sar, \remainder_c - movi \input_sar, 0 - EE.SLCXXP.2Q \input_back_aligned, \input_v0, \filter_sar, \input_sar #left shift to make the rest part 0 - EE.SLCXXP.2Q \filter_back_aligned, \filter_v0, \filter_sar, \input_sar - - EE.VMULAS.S8.ACCX \filter_v0, \input_v0 - - j 16f - -17: # input_channel < 16 - # filter remainder is in one 128bit then filter_back_aligned = filter_front_aligned - EE.LD.128.USAR.XP \filter_front_aligned, \filter_ptr, \remainder_c - # rur.sar_byte \filter_sar #filter sar - # EE.LD.128.USAR.IP \filter_back_aligned, \filter_ptr, 0 - # wur.sar_byte \filter_sar #filter sar - EE.VLD.128.IP \filter_back_aligned, \filter_ptr, 0 - EE.SRC.Q \filter_v0, \filter_front_aligned, \filter_back_aligned - - - # input remainder is in one 128bit then input_back_aligned = input_front_aligned - EE.LD.128.USAR.XP \input_front_aligned, \input_ptr, \remainder_c - # rur.sar_byte \input_sar #input sar - # EE.LD.128.USAR.IP \input_back_aligned, \input_ptr, 0 - # wur.sar_byte \input_sar #input sar - EE.VLD.128.IP \input_back_aligned, \input_ptr, 0 - EE.SRC.Q \input_v0, \input_front_aligned, \input_back_aligned - - - #left shift to make the rest part(15 - remainder_c) 0 EE.SLXCCP left shift a0+1 - movi \filter_sar, 15 - sub \filter_sar, \filter_sar, \remainder_c - movi \input_sar, 0 - EE.SLCXXP.2Q \input_back_aligned, \input_v0, \filter_sar, \input_sar #left shift to make the rest part 0 - EE.SLCXXP.2Q \filter_back_aligned, \filter_v0, \filter_sar, \input_sar - - EE.VMULAS.S8.ACCX \filter_v0, \input_v0 - -16: -.endm - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_conv2d_11cn - .type dl_tie728_s8_unaligned_conv2d_11cn, @function - # .section .iram1 -dl_tie728_s8_unaligned_conv2d_11cn: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs4 / input_sar - # a8: channel_factor - # a9: c_remainder - # a10: operation_type / n_remainder - # a11: bias_ptr - # a12: - # a13: - # a14: input_sar - # a15: moving_input_ptr - - tie728_s8_conv2d_11cn_load_args a4, a5, a6, a7 - # l32i a7, a4, 96 // output_channel_div_8 - # l32i a5, a4, 48 // filter - # l32i a6, a4, 100 // input_channel / x - 1 - l32i a9, a4, 136 # c_remainder: c % 16 - - - - l32i a8, a4, 64 // mac_shift - l32i a11, a4, 68 // bias - #l32i a12, a4, 76 // activation_alpha - l32i a13, a4, 84 // activation_shift - l32i a12, a4, 80 // activation_alpha_ptr - - - tie728_s8_unaligned_conv2d_operation_type a10, a8, a11, a13, a12, a4 - - - beqz a7, tie728_s8_conv2d_1_1_unaligned_c_n_loop_end # output_channel < 16 - - EE.LD.128.USAR.IP q1, a2, 0 - rur.sar_byte a14 - - beqi a14, 0, tie728_s8_conv2d_1_1_unaligned_c_n_loop0 - beqi a14, 8, tie728_s8_unaligned_conv2d_11cn_loop8 - -# output sar_byte != 0 && != 8 - tie728_s8_unaligned_conv2d_11cn_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_unaligned_conv2d_11cn_loop_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_unaligned_conv2d_11cn_loop_no_preload_bias: - tie728_s8_conv2d_1_1_unaligned_c q0, q1, q2, a15, q3, q4, a5, a6, a9, a14 - tie728_s8_conv2d_1_1_unaligned_c_result a10, q0, a8, a11, a12, a13, a15, q1, q2 - - # store to unaligned address - dl_tie728_s8_unaligned_store0 q0, a2, a14 - - addi a7, a7, -1 - bnez a7, tie728_s8_unaligned_conv2d_11cn_loop - j tie728_s8_conv2d_1_1_unaligned_c_n_loop_end - -# output sar_byte == 0 - tie728_s8_conv2d_1_1_unaligned_c_n_loop0: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_conv2d_1_1_unaligned_c_n_loop0_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_conv2d_1_1_unaligned_c_n_loop0_no_preload_bias: - tie728_s8_conv2d_1_1_unaligned_c q0, q1, q2, a15, q3, q4, a5, a6, a9, a14 - tie728_s8_conv2d_1_1_unaligned_c_result a10, q0, a8, a11, a12, a13, a15, q1, q2 - - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_1_1_unaligned_c_n_loop0 - j tie728_s8_conv2d_1_1_unaligned_c_n_loop_end - -# output sar_byte == 8 - tie728_s8_unaligned_conv2d_11cn_loop8: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_unaligned_conv2d_11cn_loop8_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_unaligned_conv2d_11cn_loop8_no_preload_bias: - tie728_s8_conv2d_1_1_unaligned_c q0, q1, q2, a15, q3, q4, a5, a6, a9, a14 - tie728_s8_conv2d_1_1_unaligned_c_result a10, q0, a8, a11, a12, a13, a15, q1, q2 - - #store to unaligned address - dl_tie728_s8_unaligned_store1 q0, a2 - - addi a7, a7, -1 - bnez a7, tie728_s8_unaligned_conv2d_11cn_loop8 - j tie728_s8_conv2d_1_1_unaligned_c_n_loop_end - - -tie728_s8_conv2d_1_1_unaligned_c_n_loop_end: - - - # handle the n remainder - # s16i a10, a1, 0 - EE.MOVI.32.Q q7, a10, 0 # store operation type in q7[0] - l32i a10, a4, 140 # n % 16 remainder_n - beqz a10, dl_tie728_s8_unaligned_conv2d_11cn_end - - - # n_remainder - tie728_s8_conv2d_1_1_c_unaligned_n_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.ACCX - # complete one n in ACCX - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_conv2d_1_1_c_unaligned_n_loop_no_preload_bias - tie728_s8_conv2d_element_bias a11 - tie728_s8_conv2d_1_1_c_unaligned_n_loop_no_preload_bias: - tie728_s8_conv2d_1_1_c_n_remainder q0, q1, q2, a15, q3, q4, q5, a5, a6, a9, a14, a7 - tie728_s8_conv2d_1_1_n_remainder_result q7, a7, a2, a8, a11, a12, a13, a14, q0 - # l16si a7, a1, 0 - # mov a14, a7 - - # EE.SRS.ACCX a14, a8, 0 - # clamps a14, a14, 7 - # s8i a14, a2, 0 - # addi a2, a2, 1 - - addi a10, a10, -1 - bnez a10, tie728_s8_conv2d_1_1_c_unaligned_n_loop - -dl_tie728_s8_unaligned_conv2d_11cn_end: - - retw - - - - - - - - - - - - - - -############################################################################################################################################################ -#### -#### tie728_s16_conv2d_33cn series -#### -############################################################################################################################################################ -.macro tie728_s8_conv2d_33c16 input_v0 filter_v0 filter_v1 input_ptr filter_ptr c_div_x_1 dilation_x_offset dilation_y_offset - # dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(output_t) - # dilation_y_offset = (dilation_y * input_width_with_padding * input_channel_with_padding - input_channel - dilation_x * input_channel_with_padding * (filter_width - 1)) * sizeof(output_t) - - tie728_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_y_offset - - tie728_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_y_offset - - tie728_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - # add \input_ptr, \input_ptr, \dilation_y_offset -.endm - - - -.macro tie728_s8_conv2d_hwcn_load_args args filter_ptr c_div_x_1 n_rs3 dilation_x_offset dilation_y_offset - l32i \n_rs3, \args, 96 // output_channel_div_8 - l32i \filter_ptr, \args, 48 // filter - l32i \c_div_x_1, \args, 100 // input_channel / x - 1 - l32i \dilation_x_offset, \args, 108 // input dilation x offset - l32i \dilation_y_offset, \args, 112 // input dilation y offset -.endm - - - - .align 4 - .text - .global dl_tie728_s8_conv2d_33cn - .type dl_tie728_s8_conv2d_33cn, @function - # .section .iram1 -dl_tie728_s8_conv2d_33cn: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: bias_ptr - # a12: - # a13: - # a14: - # a15: moving_input_ptr - tie728_s8_conv2d_hwcn_load_args a4, a5, a6, a7, a9, a10 - - l32i a11, a4, 68 # bias - l32i a8, a4, 64 # mac shift - blti a8, 0, dl_tie728_s8_conv2d_per_channel_33cn - - -dl_tie728_s8_conv2d_per_layer_33cn: - - beqz a11, tie728_s8_conv2d_per_layer_33cn_no_bias_loop - - tie728_s8_conv2d_per_layer_33cn_bias_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_conv2d_33c16 q0, q1, q2, a15, a5, a6, a9, a10 - tie728_s8_vector_round_result q0, a8, a15, q3 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_33cn_bias_loop - retw - - - tie728_s8_conv2d_per_layer_33cn_no_bias_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_33c16 q0, q1, q2, a15, a5, a6, a9, a10 - tie728_s8_vector_round_result q0, a8, a15, q3 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_33cn_no_bias_loop - retw - - -dl_tie728_s8_conv2d_per_channel_33cn: - l32i a8, a4, 104 # filter_channel_factor address - - beqz a11, tie728_s8_conv2d_per_channel_33cn_no_bias_loop - - tie728_s8_conv2d_per_channel_33cn_bias_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_33c16 q0, q1, q2, a15, a5, a6, a9, a10 - # tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - # tie728_s8_conv2d_bias q0, q1, a11 - tie728_s8_conv2d_per_channel_with_bias_result q0, q1, a8, a11, a15, q2 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_33cn_bias_loop - retw - - - tie728_s8_conv2d_per_channel_33cn_no_bias_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_33c16 q0, q1, q2, a15, a5, a6, a9, a10 - tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_33cn_no_bias_loop - retw - - - - .align 4 - .text - .global dl_tie728_s8_conv2d_33cn_relu - .type dl_tie728_s8_conv2d_33cn_relu, @function - # .section .iram1 -dl_tie728_s8_conv2d_33cn_relu: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: bias_ptr - # a12: activation_alpha - # a13: activation_shift - # a14: - # a15: moving_input_ptr - tie728_s8_conv2d_hwcn_load_args a4, a5, a6, a7, a9, a10 - - l32i a12, a4, 76 # activation_alpha - l32i a13, a4, 84 # activation_shift - - l32i a11, a4, 68 # bias - l32i a8, a4, 64 # mac shift - blti a8, 0, dl_tie728_s8_conv2d_per_channel_33cn_relu - - -dl_tie728_s8_conv2d_per_layer_33cn_relu: - - beqz a11, tie728_s8_conv2d_per_layer_33cn_no_bias_relu_loop - - tie728_s8_conv2d_per_layer_33cn_bias_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_conv2d_33c16 q0, q1, q2, a15, a5, a6, a9, a10 - tie728_s8_vector_round_result q0, a8, a15, q3 - tie728_s8_conv2d_relu q0, a12, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_33cn_bias_relu_loop - retw - - - tie728_s8_conv2d_per_layer_33cn_no_bias_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_33c16 q0, q1, q2, a15, a5, a6, a9, a10 - tie728_s8_vector_round_result q0, a8, a15, q3 - tie728_s8_conv2d_relu q0, a12, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_33cn_no_bias_relu_loop - retw - - -dl_tie728_s8_conv2d_per_channel_33cn_relu: - l32i a8, a4, 104 # filter_channel_factor address - - beqz a11, tie728_s8_conv2d_per_channel_33cn_no_bias_relu_loop - - tie728_s8_conv2d_per_channel_33cn_bias_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_33c16 q0, q1, q2, a15, a5, a6, a9, a10 - # tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - # tie728_s8_conv2d_bias_relu q0, q1, a11, a12, a13 - tie728_s8_conv2d_per_channel_with_bias_result q0, q1, a8, a11, a15, q2 - tie728_s8_conv2d_relu q0, a12, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_33cn_bias_relu_loop - retw - - - tie728_s8_conv2d_per_channel_33cn_no_bias_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_33c16 q0, q1, q2, a15, a5, a6, a9, a10 - tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - tie728_s8_conv2d_relu q0, a12, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_33cn_no_bias_relu_loop - retw - - - - .align 4 - .text - .global dl_tie728_s8_conv2d_33cn_prelu - .type dl_tie728_s8_conv2d_33cn_prelu, @function - # .section .iram1 -dl_tie728_s8_conv2d_33cn_prelu: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: bias_ptr - # a12: activation_alpha - # a13: activation_shift - # a14: - # a15: moving_input_ptr - tie728_s8_conv2d_hwcn_load_args a4, a5, a6, a7, a9, a10 - - l32i a12, a4, 80 # activation_alpha_ptr - l32i a13, a4, 84 # activation_shift - - l32i a11, a4, 68 # bias - l32i a8, a4, 64 # mac shift - blti a8, 0, dl_tie728_s8_conv2d_per_channel_33cn_prelu - - -dl_tie728_s8_conv2d_per_layer_33cn_prelu: - - beqz a11, tie728_s8_conv2d_per_layer_33cn_no_bias_prelu_loop - - tie728_s8_conv2d_per_layer_33cn_bias_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_conv2d_33c16 q0, q1, q2, a15, a5, a6, a9, a10 - tie728_s8_vector_round_result q0, a8, a15, q3 - tie728_s8_conv2d_prelu q0, q2, a12, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_33cn_bias_prelu_loop - retw - - - tie728_s8_conv2d_per_layer_33cn_no_bias_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_33c16 q0, q1, q2, a15, a5, a6, a9, a10 - tie728_s8_vector_round_result q0, a8, a15, q3 - tie728_s8_conv2d_prelu q0, q2, a12, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_33cn_no_bias_prelu_loop - retw - - -dl_tie728_s8_conv2d_per_channel_33cn_prelu: - l32i a8, a4, 104 # filter_channel_factor address - - beqz a11, tie728_s8_conv2d_per_channel_33cn_no_bias_prelu_loop - - tie728_s8_conv2d_per_channel_33cn_bias_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_33c16 q0, q1, q2, a15, a5, a6, a9, a10 - # tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - # tie728_s8_conv2d_bias_prelu q0, q1, a11, q2, a12, a13 - tie728_s8_conv2d_per_channel_with_bias_result q0, q1, a8, a11, a15, q2 - tie728_s8_conv2d_prelu q0, q2, a12, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_33cn_bias_prelu_loop - retw - - - tie728_s8_conv2d_per_channel_33cn_no_bias_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_33c16 q0, q1, q2, a15, a5, a6, a9, a10 - tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - tie728_s8_conv2d_prelu q0, q2, a12, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_33cn_no_bias_prelu_loop - retw - - - - -############################################################################################################################################################ -#### -#### tie728_s8_unaligned_conv2d_33cn -#### -############################################################################################################################################################ - - -.macro tie728_s8_conv2d_3_3_unaligned_c input_v0 input_front_aligned input_back_aligned input_ptr filter_v0 filter_v1 filter_ptr c_div_x_1 dilation_x_offset dilation_y_offset remainder_c input_sar - # dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(output_t) - # dilation_y_offset = (dilation_y * input_width_with_padding * input_channel_with_padding - input_channel - dilation_x * input_channel_with_padding * (filter_width - 1)) * sizeof(output_t) - - tie728_s8_conv2d_1_1_unaligned_c \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_1_1_unaligned_c \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_1_1_unaligned_c \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar - add \input_ptr, \input_ptr, \dilation_y_offset - - tie728_s8_conv2d_1_1_unaligned_c \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_1_1_unaligned_c \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_1_1_unaligned_c \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar - add \input_ptr, \input_ptr, \dilation_y_offset - - tie728_s8_conv2d_1_1_unaligned_c \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_1_1_unaligned_c \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_1_1_unaligned_c \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar - # add \input_ptr, \input_ptr, \dilation_y_offset -.endm - - - - -.macro tie728_s8_conv2d_3_3_c_n_remainder input_v0 input_front_aligned input_back_aligned input_ptr filter_v0 filter_front_aligned filter_back_aligned filter_ptr c_div_x_1 dilation_x_offset dilation_y_offset remainder_c input_sar filter_sar - - tie728_s8_conv2d_1_1_c_n_remainder \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar, \filter_sar - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_1_1_c_n_remainder \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar, \filter_sar - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_1_1_c_n_remainder \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar, \filter_sar - add \input_ptr, \input_ptr, \dilation_y_offset - - tie728_s8_conv2d_1_1_c_n_remainder \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar, \filter_sar - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_1_1_c_n_remainder \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar, \filter_sar - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_1_1_c_n_remainder \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar, \filter_sar - add \input_ptr, \input_ptr, \dilation_y_offset - - tie728_s8_conv2d_1_1_c_n_remainder \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar, \filter_sar - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_1_1_c_n_remainder \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar, \filter_sar - add \input_ptr, \input_ptr, \dilation_x_offset - - tie728_s8_conv2d_1_1_c_n_remainder \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \c_div_x_1, \remainder_c, \input_sar, \filter_sar - # add \input_ptr, \input_ptr, \dilation_y_offset -.endm - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_conv2d_33cn - .type dl_tie728_s8_unaligned_conv2d_33cn, @function - # .section .iram1 -dl_tie728_s8_unaligned_conv2d_33cn: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs4 / input_sar - # a8: channel_factor - # a9: input dilation x offset - # a10: input dilation y offset - # a10: / c_remainder - # a11: bias_ptr - # a12: activation_alpha - # a13: activation_shift / n_remainder - # a14: tmp variable: input_sar / operation_type q7[0] - # a15: moving_input_ptr - - tie728_s8_conv2d_hwcn_load_args a4, a5, a6, a7, a9, a10 - # l32i a7, a4, 96 // output_channel_div_8 - # l32i a5, a4, 48 // filter - # l32i a6, a4, 100 // input_channel / x - 1 - - l32i a8, a4, 64 // mac_shift - l32i a11, a4, 68 // bias - #l32i a12, a4, 76 // activation_alpha - l32i a13, a4, 84 // activation_shift - l32i a12, a4, 80 // activation_alpha_ptr - - tie728_s8_unaligned_conv2d_operation_type a14, a8, a11, a13, a12, a4 - EE.MOVI.32.Q q7, a14, 0 # operation_type q7[0] - - beqz a7, tie728_s8_conv2d_3_3_unaligned_c_n_loop_end # output_channel < 16 - - EE.LD.128.USAR.IP q1, a2, 0 # output sar_byte - rur.sar_byte a14 - - bgez a13, tie728_s8_conv2d_3_3_unaligned_c_n_activation - l32i a13, a4, 136 # c_remainder: c % 16, replace activation_shift - EE.MOVI.32.A q7, a12, 0 # operation_type, replace activation_alpha - - beqi a14, 0, tie728_s8_conv2d_3_3_unaligned_c_n_loop0 - beqi a14, 8, tie728_s8_conv2d_3_3_unaligned_c_n_loop8 - - # output sar_byte != 0 && != 8 - tie728_s8_conv2d_3_3_unaligned_c_n_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_conv2d_3_3_unaligned_c_n_loop_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_conv2d_3_3_unaligned_c_n_loop_no_preload_bias: - tie728_s8_conv2d_3_3_unaligned_c q0, q1, q2, a15, q3, q4, a5, a6, a9, a10, a13, a14 - tie728_s8_conv2d_1_1_unaligned_c_result a12, q0, a8, a11, a12, a13, a15, q1, q2 - - #store to unaligned address - dl_tie728_s8_unaligned_store0 q0, a2, a14 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_3_3_unaligned_c_n_loop - j tie728_s8_conv2d_3_3_unaligned_c_n_loop_end - - # output sar_byte == 0 - tie728_s8_conv2d_3_3_unaligned_c_n_loop0: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_conv2d_3_3_unaligned_c_n_loop0_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_conv2d_3_3_unaligned_c_n_loop0_no_preload_bias: - tie728_s8_conv2d_3_3_unaligned_c q0, q1, q2, a15, q3, q4, a5, a6, a9, a10, a13, a14 - tie728_s8_conv2d_1_1_unaligned_c_result a12, q0, a8, a11, a12, a13, a15, q1, q2 - - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_3_3_unaligned_c_n_loop0 - j tie728_s8_conv2d_3_3_unaligned_c_n_loop_end - - # output sar_byte == 8 - tie728_s8_conv2d_3_3_unaligned_c_n_loop8: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_conv2d_3_3_unaligned_c_n_loop8_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_conv2d_3_3_unaligned_c_n_loop8_no_preload_bias: - tie728_s8_conv2d_3_3_unaligned_c q0, q1, q2, a15, q3, q4, a5, a6, a9, a10, a13, a14 - tie728_s8_conv2d_1_1_unaligned_c_result a12, q0, a8, a11, a12, a13, a15, q1, q2 - - #store to unaligned address - dl_tie728_s8_unaligned_store1 q0, a2 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_3_3_unaligned_c_n_loop8 - j tie728_s8_conv2d_3_3_unaligned_c_n_loop_end - - tie728_s8_conv2d_3_3_unaligned_c_n_activation: - beqi a14, 0, tie728_s8_conv2d_3_3_unaligned_c_n_activation_loop0 - beqi a14, 8, tie728_s8_conv2d_3_3_unaligned_c_n_activation_loop8 - - # output sar_byte != 0 && != 8 - tie728_s8_conv2d_3_3_unaligned_c_n_activation_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_conv2d_3_3_unaligned_c_n_activation_loop_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_conv2d_3_3_unaligned_c_n_activation_loop_no_preload_bias: - l32i a13, a4, 136 # c_remainder: c % 16, replace activation_shift - tie728_s8_conv2d_3_3_unaligned_c q0, q1, q2, a15, q3, q4, a5, a6, a9, a10, a13, a14 - - EE.MOVI.32.A q7, a14, 0 # operation_type - l32i a13, a4, 84 # activation_shift - tie728_s8_conv2d_1_1_unaligned_c_result a14, q0, a8, a11, a12, a13, a15, q1, q2 - - #store to unaligned address - dl_tie728_s8_unaligned_store0 q0, a2, a14 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_3_3_unaligned_c_n_activation_loop - j tie728_s8_conv2d_3_3_unaligned_c_n_loop_end - - # output sar_byte == 0 - tie728_s8_conv2d_3_3_unaligned_c_n_activation_loop0: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_conv2d_3_3_unaligned_c_n_activation_loop0_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_conv2d_3_3_unaligned_c_n_activation_loop0_no_preload_bias: - l32i a13, a4, 136 # c_remainder: c % 16, replace activation_shift - tie728_s8_conv2d_3_3_unaligned_c q0, q1, q2, a15, q3, q4, a5, a6, a9, a10, a13, a14 - - EE.MOVI.32.A q7, a14, 0 # operation_type - l32i a13, a4, 84 # activation_shift - tie728_s8_conv2d_1_1_unaligned_c_result a14, q0, a8, a11, a12, a13, a15, q1, q2 - - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_3_3_unaligned_c_n_activation_loop0 - j tie728_s8_conv2d_3_3_unaligned_c_n_loop_end - - # output sar_byte == 8 - tie728_s8_conv2d_3_3_unaligned_c_n_activation_loop8: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_conv2d_3_3_unaligned_c_n_activation_loop8_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_conv2d_3_3_unaligned_c_n_activation_loop8_no_preload_bias: - l32i a13, a4, 136 # c_remainder: c % 16, replace activation_shift - tie728_s8_conv2d_3_3_unaligned_c q0, q1, q2, a15, q3, q4, a5, a6, a9, a10, a13, a14 - - EE.MOVI.32.A q7, a14, 0 # operation_type - l32i a13, a4, 84 # activation_shift - tie728_s8_conv2d_1_1_unaligned_c_result a14, q0, a8, a11, a12, a13, a15, q1, q2 - - #store to unaligned address - dl_tie728_s8_unaligned_store1 q0, a2 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_3_3_unaligned_c_n_activation_loop8 - j tie728_s8_conv2d_3_3_unaligned_c_n_loop_end - -tie728_s8_conv2d_3_3_unaligned_c_n_loop_end: - - - # handle the n remainder - l32i a13, a4, 140 # n % 16 remainder_n - beqz a13, dl_tie728_s8_unaligned_conv2d_33cn_end - - l32i a14, a4, 84 # activation_shift - - bgez a14, tie728_s8_conv2d_3_3_c_unaligned_n_activation - l32i a12, a4, 136 # c_remainder: c % 16, replace activation_shift - # n_remainder - tie728_s8_conv2d_3_3_c_unaligned_n_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.ACCX - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_conv2d_3_3_c_unaligned_n_loop_no_preload_bias - tie728_s8_conv2d_element_bias a11 - tie728_s8_conv2d_3_3_c_unaligned_n_loop_no_preload_bias: - - # complete one n in ACCX - tie728_s8_conv2d_3_3_c_n_remainder q0, q1, q2, a15, q3, q4, q5, a5, a6, a9, a10, a12, a14, a7 - tie728_s8_conv2d_1_1_n_remainder_result q7, a7, a2, a8, a11, a12, a13, a14, q0 - - addi a13, a13, -1 - bnez a13, tie728_s8_conv2d_3_3_c_unaligned_n_loop - j dl_tie728_s8_unaligned_conv2d_33cn_end - - tie728_s8_conv2d_3_3_c_unaligned_n_activation: - EE.MOVI.32.Q q6, a13, 0 # store remainder_n in q6[0] - - # n_remainder - tie728_s8_conv2d_3_3_c_unaligned_n_activation_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.ACCX - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_conv2d_3_3_c_unaligned_n_activation_loop_no_preload_bias - tie728_s8_conv2d_element_bias a11 - tie728_s8_conv2d_3_3_c_unaligned_n_activation_loop_no_preload_bias: - - # complete one n in ACCX - l32i a13, a4, 136 # c_remainder: c % 16, replace activation_shift - tie728_s8_conv2d_3_3_c_n_remainder q0, q1, q2, a15, q3, q4, q5, a5, a6, a9, a10, a13, a14, a7 - - l32i a13, a4, 84 # activation_shift - tie728_s8_conv2d_1_1_n_remainder_result q7, a7, a2, a8, a11, a12, a13, a14, q0 - - EE.MOVI.32.A q6, a13, 0 # remainder_n in q6[0] - addi a13, a13, -1 - EE.MOVI.32.Q q6, a13, 0 - bnez a13, tie728_s8_conv2d_3_3_c_unaligned_n_activation_loop - - -dl_tie728_s8_unaligned_conv2d_33cn_end: - - retw - - - - - - - - -############################################################################################################################################################ -#### -#### tie728_s8_conv2d_hwcn series -#### -############################################################################################################################################################ - -.macro tie728_s8_conv2d_hwc16 input_v0 input_ptr filter_v0 filter_v1 filter_ptr c_div_x_1 dilation_x_offset dilation_y_offset filter_h filter_w filter_offset_q, args - # dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(output_t) - # dilation_y_offset = (dilation_y * input_width_with_padding * input_channel_with_padding - input_channel - dilation_x * input_channel_with_padding * (filter_width - 1)) * sizeof(output_t) - # filter_h - # filter_w - - l32i \filter_w, \args, 56 # filter_width - blti \filter_w, 2, 3f - - l32i \filter_h, \args, 52 # filter_height - 1: - l32i \filter_w, \args, 56 # filter_width - 2: - tie728_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \input_ptr, \input_ptr, \dilation_x_offset - - addi \filter_w, \filter_w, -1 - bgei \filter_w, 2, 2b - - tie728_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - - EE.MOVI.32.A \filter_offset_q, \filter_w, 1 - add \filter_ptr, \filter_ptr, \filter_w - add \input_ptr, \input_ptr, \dilation_y_offset - - addi \filter_h, \filter_h, -1 - bnez \filter_h, 1b - j 5f - - # filter_w == 1 - 3: - l32i \filter_h, \args, 52 # filter_height - EE.MOVI.32.A \filter_offset_q, \filter_w, 1 - 4: - tie728_s8_conv2d_11c16 \input_v0, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1 - add \filter_ptr, \filter_ptr, \filter_w - add \input_ptr, \input_ptr, \dilation_y_offset - - addi \filter_h, \filter_h, -1 - bnez \filter_h, 4b - - 5: - EE.MOVI.32.A \filter_offset_q, \filter_h, 2 - add \filter_ptr, \filter_ptr, \filter_h -.endm - - - - .align 4 - .text - .global dl_tie728_s8_conv2d_hwcn - .type dl_tie728_s8_conv2d_hwcn, @function - # .section .iram1 -dl_tie728_s8_conv2d_hwcn: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: filter_height - # a12: filter_width - # a13: bias_ptr - # a14: filter_y_offset - # a15: moving_input_ptr - tie728_s8_conv2d_hwcn_load_args a4, a5, a6, a7, a9, a10 - - l32i a13, a4, 68 # bias - l32i a8, a4, 64 # mac shift - l32i a12, a4, 60 # filter_y_offset - l32i a11, a4, 144 - EE.MOVI.32.Q q6, a12, 1 #filter_y_offset - EE.MOVI.32.Q q6, a11, 2 #filter_n_offset - -dl_tie728_s8_conv2d_per_layer_hwcn: - - beqz a13, tie728_s8_conv2d_per_layer_hwcn_no_bias_loop - - tie728_s8_conv2d_per_layer_hwcn_bias_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_128b_vector_bias a13 - tie728_s8_conv2d_hwc16 q0, a15, q1, q2, a5, a6, a9, a10, a11, a12, q6, a4 - tie728_s8_vector_round_result q0, a8, a15, q3 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_hwcn_bias_loop - retw - - - tie728_s8_conv2d_per_layer_hwcn_no_bias_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_hwc16 q0, a15, q1, q2, a5, a6, a9, a10, a11, a12, q6, a4 - tie728_s8_vector_round_result q0, a8, a15, q3 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_hwcn_no_bias_loop - retw - - -dl_tie728_s8_conv2d_per_channel_hwcn: - l32i a8, a4, 104 # filter_channel_factor address - - beqz a13, tie728_s8_conv2d_per_channel_hwcn_no_bias_loop - - tie728_s8_conv2d_per_channel_hwcn_bias_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_hwc16 q0, a15, q1, q2, a5, a6, a9, a10, a11, a12, q6, a4 - # tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - # tie728_s8_conv2d_bias q0, q1, a13 - tie728_s8_conv2d_per_channel_with_bias_result q0, q1, a8, a13, a15, q2 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_hwcn_bias_loop - retw - - - tie728_s8_conv2d_per_channel_hwcn_no_bias_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_hwc16 q0, a15, q1, q2, a5, a6, a9, a10, a11, a12, q6, a4 - tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_hwcn_no_bias_loop - retw - - - - .align 4 - .text - .global dl_tie728_s8_conv2d_hwcn_relu - .type dl_tie728_s8_conv2d_hwcn_relu, @function - # .section .iram1 -dl_tie728_s8_conv2d_hwcn_relu: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs3 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: filter_height / activation_shift - # a12: filter_width - # a13: bias_ptr - # a14: activation_alpha - # a15: moving_input_ptr - tie728_s8_conv2d_hwcn_load_args a4, a5, a6, a7, a9, a10 - - l32i a14, a4, 76 # activation_alpha - # l32i a13, a4, 84 # activation_shift - - l32i a13, a4, 68 # bias - l32i a8, a4, 64 # mac shift - l32i a12, a4, 60 # filter_y_offset - l32i a11, a4, 144 # filter_n_offset - EE.MOVI.32.Q q6, a12, 1 - EE.MOVI.32.Q q6, a11, 2 - blti a8, 0, dl_tie728_s8_conv2d_per_channel_hwcn_relu - - -dl_tie728_s8_conv2d_per_layer_hwcn_relu: - - beqz a13, tie728_s8_conv2d_per_layer_hwcn_no_bias_relu - - tie728_s8_conv2d_per_layer_hwcn_bias_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_128b_vector_bias a13 - tie728_s8_conv2d_hwc16 q0, a15, q1, q2, a5, a6, a9, a10, a11, a12, q6, a4 - tie728_s8_vector_round_result q0, a8, a15, q3 - l32i a11, a4, 84 # activation_shift - tie728_s8_conv2d_relu q0, a14, a11 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_hwcn_bias_relu_loop - retw - - - tie728_s8_conv2d_per_layer_hwcn_no_bias_relu: - l32i a13, a4, 84 # activation_shift - - tie728_s8_conv2d_per_layer_hwcn_no_bias_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_hwc16 q0, a15, q1, q2, a5, a6, a9, a10, a11, a12, q6, a4 - tie728_s8_vector_round_result q0, a8, a15, q3 - tie728_s8_conv2d_relu q0, a14, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_hwcn_no_bias_relu_loop - retw - - -dl_tie728_s8_conv2d_per_channel_hwcn_relu: - l32i a8, a4, 104 # filter_channel_factor address - - beqz a13, tie728_s8_conv2d_per_channel_hwcn_no_bias_relu - - tie728_s8_conv2d_per_channel_hwcn_bias_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_hwc16 q0, a15, q1, q2, a5, a6, a9, a10, a11, a12, q6, a4 - # tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - # l32i a11, a4, 84 # activation_shift - # tie728_s8_conv2d_bias_relu q0, q1, a13, a14, a11 - tie728_s8_conv2d_per_channel_with_bias_result q0, q1, a8, a13, a15, q2 - - l32i a11, a4, 84 # activation_shift - tie728_s8_conv2d_relu q0, a14, a11 - - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_hwcn_bias_relu_loop - retw - - - tie728_s8_conv2d_per_channel_hwcn_no_bias_relu: - l32i a13, a4, 84 # activation_shift - - tie728_s8_conv2d_per_channel_hwcn_no_bias_relu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_hwc16 q0, a15, q1, q2, a5, a6, a9, a10, a11, a12, q6, a4 - tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - tie728_s8_conv2d_relu q0, a14, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_hwcn_no_bias_relu_loop - retw - - - - .align 4 - .text - .global dl_tie728_s8_conv2d_hwcn_prelu - .type dl_tie728_s8_conv2d_hwcn_prelu, @function - # .section .iram1 -dl_tie728_s8_conv2d_hwcn_prelu: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs4 - # a8: mac_shift - # a9: input dilation x offset - # a10: input dilation y offset - # a11: filter_height / activation_shift - # a12: filter_width - # a13: bias_ptr - # a14: activation_alpha - # a15: moving_input_ptr - tie728_s8_conv2d_hwcn_load_args a4, a5, a6, a7, a9, a10 - - l32i a14, a4, 80 # activation_alpha_ptr - # l32i a13, a4, 84 # activation_shift - - l32i a13, a4, 68 # bias - l32i a8, a4, 64 # mac shift - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q6, a12, 1 - EE.MOVI.32.Q q6, a11, 2 - blti a8, 0, dl_tie728_s8_conv2d_per_channel_hwcn_prelu - - -dl_tie728_s8_conv2d_per_layer_hwcn_prelu: - - beqz a13, tie728_s8_conv2d_per_layer_hwcn_no_bias_prelu - - tie728_s8_conv2d_per_layer_hwcn_bias_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_128b_vector_bias a13 - tie728_s8_conv2d_hwc16 q0, a15, q1, q2, a5, a6, a9, a10, a11, a12, q6, a4 - tie728_s8_vector_round_result q0, a8, a15, q3 - l32i a11, a4, 84 # activation_shift - tie728_s8_conv2d_prelu q0, q2, a14, a11 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_hwcn_bias_prelu_loop - retw - - - tie728_s8_conv2d_per_layer_hwcn_no_bias_prelu: - l32i a13, a4, 84 # activation_shift - - tie728_s8_conv2d_per_layer_hwcn_no_bias_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_hwc16 q0, a15, q1, q2, a5, a6, a9, a10, a11, a12, q6, a4 - tie728_s8_vector_round_result q0, a8, a15, q3 - tie728_s8_conv2d_prelu q0, q2, a14, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_layer_hwcn_no_bias_prelu_loop - retw - - -dl_tie728_s8_conv2d_per_channel_hwcn_prelu: - l32i a8, a4, 104 # filter_channel_factor address - - beqz a13, tie728_s8_conv2d_per_channel_hwcn_no_bias_prelu - - tie728_s8_conv2d_per_channel_hwcn_bias_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_hwc16 q0, a15, q1, q2, a5, a6, a9, a10, a11, a12, q6, a4 - # tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - # l32i a11, a4, 84 # activation_shift - # tie728_s8_conv2d_bias_prelu q0, q1, a13, q2, a14, a11 - tie728_s8_conv2d_per_channel_with_bias_result q0, q1, a8, a13, a15, q2 - l32i a11, a4, 84 # activation_shift - tie728_s8_conv2d_prelu q0, q2, a14, a11 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_hwcn_bias_prelu_loop - retw - - - tie728_s8_conv2d_per_channel_hwcn_no_bias_prelu: - l32i a13, a4, 84 # activation_shift - - tie728_s8_conv2d_per_channel_hwcn_no_bias_prelu_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - tie728_s8_conv2d_hwc16 q0, a15, q1, q2, a5, a6, a9, a10, a11, a12, q6, a4 - tie728_s8_conv2d_per_channel_result q0, q1, a8, a15, q2 - tie728_s8_conv2d_prelu q0, q2, a14, a13 - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_per_channel_hwcn_no_bias_prelu_loop - retw - - - - - -############################################################################################################################################################ -#### -#### tie728_s8_unaligned_conv2d_hwcn series -#### -############################################################################################################################################################ - - - -.macro tie728_s8_conv2d_h_w_unaligned_c input_v0 input_front_aligned input_back_aligned input_ptr filter_v0 filter_v1 filter_ptr c_div_x_1 filter_h filter_w args remainder_c tmp filter_offset_q - # dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(output_t) - # dilation_y_offset = (dilation_y * input_width_with_padding * input_channel_with_padding - input_channel - dilation_x * input_channel_with_padding * (filter_width - 1)) * sizeof(output_t) - # filter_h - # filter_w - - l32i \filter_w, \args, 56 # filter_width - blti \filter_w, 2, 23f - - - l32i \filter_h, \args, 52 # filter_height - 21: - l32i \filter_w, \args, 56 # filter_width - 22: - tie728_s8_conv2d_1_1_unaligned_c \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \remainder_c, \tmp - l32i \tmp, \args, 108 # input dilation x offset - add \input_ptr, \input_ptr, \tmp - - - addi \filter_w, \filter_w, -1 - bgei \filter_w, 2, 22b - - tie728_s8_conv2d_1_1_unaligned_c \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \remainder_c, \tmp - l32i \tmp, \args, 112 # input dilation y offset - add \input_ptr, \input_ptr, \tmp - EE.MOVI.32.A \filter_offset_q, \filter_w, 1 - add \filter_ptr, \filter_ptr, \filter_w - - addi \filter_h, \filter_h, -1 - bnez \filter_h, 21b - j 25f - - # filter_w == 1 - 23: - l32i \filter_h, \args, 52 # filter_height - EE.MOVI.32.A \filter_offset_q, \filter_w, 1 - 24: - tie728_s8_conv2d_1_1_unaligned_c \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_v1, \filter_ptr, \c_div_x_1, \remainder_c, \tmp - l32i \tmp, \args, 112 # input dilation y offset - add \input_ptr, \input_ptr, \tmp - add \filter_ptr, \filter_ptr, \filter_w - - addi \filter_h, \filter_h, -1 - bnez \filter_h, 24b - -25: - EE.MOVI.32.A \filter_offset_q, \filter_h, 2 - add \filter_ptr, \filter_ptr, \filter_h -.endm - - - - -.macro tie728_s8_conv2d_h_w_c_n_remainder input_v0 input_front_aligned input_back_aligned input_ptr filter_v0 filter_front_aligned filter_back_aligned filter_ptr c_div_x_1 args filter_h filter_w remainder_c tmp tmp1 filter_offset_q - # dilation_x_offset = (dilation_x * input_channel_with_padding - input_channel) * sizeof(output_t) - # dilation_y_offset = (dilation_y * input_width_with_padding * input_channel_with_padding - input_channel - dilation_x * input_channel_with_padding * (filter_width - 1)) * sizeof(output_t) - # filter_h - # filter_w - - l32i \filter_w, \args, 56 # filter_width - blti \filter_w, 2, 22f - - l32i \filter_h, \args, 52 # filter_height - 21: - l32i \filter_w, \args, 56 # filter_width - 20: - tie728_s8_conv2d_1_1_c_n_remainder \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \c_div_x_1, \remainder_c, \tmp, \tmp1 - l32i \tmp, \args, 108 # input dilation x offset - add \input_ptr, \input_ptr, \tmp - - - addi \filter_w, \filter_w, -1 - bgei \filter_w, 2, 20b - - tie728_s8_conv2d_1_1_c_n_remainder \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \c_div_x_1, \remainder_c, \tmp, \tmp1 - l32i \tmp, \args, 112 # input dilation y offset - add \input_ptr, \input_ptr, \tmp - EE.MOVI.32.A \filter_offset_q, \filter_w, 1 - add \filter_ptr, \filter_ptr, \filter_w - - addi \filter_h, \filter_h, -1 - bnez \filter_h, 21b - j 24f - - # filter_w == 1 - 22: - l32i \filter_h, \args, 52 # filter_height - EE.MOVI.32.A \filter_offset_q, \filter_w, 1 - 23: - tie728_s8_conv2d_1_1_c_n_remainder \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \c_div_x_1, \remainder_c, \tmp, \tmp1 - l32i \tmp, \args, 112 # input dilation y offset - add \input_ptr, \input_ptr, \tmp - add \filter_ptr, \filter_ptr, \filter_w - - addi \filter_h, \filter_h, -1 - bnez \filter_h, 23b - -24: - EE.MOVI.32.A \filter_offset_q, \filter_h, 2 - add \filter_ptr, \filter_ptr, \filter_h -.endm - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_conv2d_hwcn - .type dl_tie728_s8_unaligned_conv2d_hwcn, @function - # .section .iram1 -dl_tie728_s8_unaligned_conv2d_hwcn: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: c_div_x_1 - # a7: n_rs4 - # a8: mac_shift - # a9: remainder_c - # a10: tmp / input dilation x offset / input dilation y offset - # a11: filter_height / activation_shift - # a12: filter_width - # a13: bias_ptr - # a14: activation_alpha_ptr - # a15: moving_input_ptr - - tie728_s8_conv2d_hwcn_load_args a4, a5, a6, a7, a9, a10 - # l32i a7, a4, 96 // output_channel_div_8 - # l32i a5, a4, 48 // filter - # l32i a6, a4, 100 // input_channel / x - 1 - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q6, a12, 1 - EE.MOVI.32.Q q6, a11, 2 - - l32i a8, a4, 64 // mac_shift - l32i a13, a4, 68 // bias - #l32i a14, a4, 76 // activation_alpha - l32i a11, a4, 84 // activation_shift - l32i a14, a4, 80 // activation_alpha_ptr - - tie728_s8_unaligned_conv2d_operation_type a12, a8, a13, a11, a14, a4 - EE.MOVI.32.Q q7, a12, 0 # operation_type q7[0] - - beqz a7, tie728_s8_conv2d_h_w_unaligned_c_n_loop_end # output_channel < 16 - - EE.LD.128.USAR.IP q1, a2, 0 # output sar_byte - rur.sar_byte a12 - - bgez a11, tie728_s8_conv2d_h_w_unaligned_c_n_activation - l32i a9, a4, 136 # c_remainder: c % 16 - EE.MOVI.32.A q7, a10, 0 # operation_type - - beqi a12, 0, tie728_s8_conv2d_h_w_unaligned_c_n_loop0 - beqi a12, 8, tie728_s8_conv2d_h_w_unaligned_c_n_loop8 - - # output sar_byte != 0 && != 8 - tie728_s8_conv2d_h_w_unaligned_c_n_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a13, tie728_s8_conv2d_h_w_unaligned_c_n_loop_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a13 - tie728_s8_conv2d_h_w_unaligned_c_n_loop_no_preload_bias: - tie728_s8_conv2d_h_w_unaligned_c q0, q1, q2, a15, q3, q4, a5, a6, a11, a12, a4, a9, a14, q6 - tie728_s8_conv2d_1_1_unaligned_c_result a10, q0, a8, a13, a11, a11, a11, q1, q2 - - #store to unaligned address - dl_tie728_s8_unaligned_store0 q0, a2, a14 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_h_w_unaligned_c_n_loop - j tie728_s8_conv2d_h_w_unaligned_c_n_loop_end - - # output sar_byte == 0 - tie728_s8_conv2d_h_w_unaligned_c_n_loop0: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a13, tie728_s8_conv2d_h_w_unaligned_c_n_loop0_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a13 - tie728_s8_conv2d_h_w_unaligned_c_n_loop0_no_preload_bias: - tie728_s8_conv2d_h_w_unaligned_c q0, q1, q2, a15, q3, q4, a5, a6, a11, a12, a4, a9, a14, q6 - tie728_s8_conv2d_1_1_unaligned_c_result a10, q0, a8, a13, a11, a11, a11, q1, q2 - - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_h_w_unaligned_c_n_loop0 - j tie728_s8_conv2d_h_w_unaligned_c_n_loop_end - - # output sar_byte == 8 - tie728_s8_conv2d_h_w_unaligned_c_n_loop8: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a13, tie728_s8_conv2d_h_w_unaligned_c_n_loop8_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a13 - tie728_s8_conv2d_h_w_unaligned_c_n_loop8_no_preload_bias: - tie728_s8_conv2d_h_w_unaligned_c q0, q1, q2, a15, q3, q4, a5, a6, a11, a12, a4, a9, a14, q6 - tie728_s8_conv2d_1_1_unaligned_c_result a10, q0, a8, a13, a11, a11, a11, q1, q2 - - #store to unaligned address - dl_tie728_s8_unaligned_store1 q0, a2 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_h_w_unaligned_c_n_loop8 - j tie728_s8_conv2d_h_w_unaligned_c_n_loop_end - - - tie728_s8_conv2d_h_w_unaligned_c_n_activation: - l32i a9, a4, 136 # c_remainder: c % 16 - - EE.LD.128.USAR.IP q1, a2, 0 # output sar_byte - rur.sar_byte a12 - beqi a12, 0, tie728_s8_conv2d_h_w_unaligned_c_n_activation_loop0 - beqi a12, 8, tie728_s8_conv2d_h_w_unaligned_c_n_activation_loop8 - - # output sar_byte != 0 && != 8 - tie728_s8_conv2d_h_w_unaligned_c_n_activation_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a13, tie728_s8_conv2d_h_w_unaligned_c_n_activation_loop_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a13 - tie728_s8_conv2d_h_w_unaligned_c_n_activation_loop_no_preload_bias: - tie728_s8_conv2d_h_w_unaligned_c q0, q1, q2, a15, q3, q4, a5, a6, a11, a12, a4, a9, a10, q6 - EE.MOVI.32.A q7, a10, 0 # operation_type - l32i a11, a4, 84 # activation_shift - tie728_s8_conv2d_1_1_unaligned_c_result a10, q0, a8, a13, a14, a11, a15, q1, q2 - - #store to unaligned address - dl_tie728_s8_unaligned_store0 q0, a2, a11 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_h_w_unaligned_c_n_activation_loop - j tie728_s8_conv2d_h_w_unaligned_c_n_loop_end - - # output sar_byte == 0 - tie728_s8_conv2d_h_w_unaligned_c_n_activation_loop0: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a13, tie728_s8_conv2d_h_w_unaligned_c_n_activation_loop0_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a13 - tie728_s8_conv2d_h_w_unaligned_c_n_activation_loop0_no_preload_bias: - tie728_s8_conv2d_h_w_unaligned_c q0, q1, q2, a15, q3, q4, a5, a6, a11, a12, a4, a9, a10, q6 - EE.MOVI.32.A q7, a10, 0 # operation_type - l32i a11, a4, 84 # activation_shift - tie728_s8_conv2d_1_1_unaligned_c_result a10, q0, a8, a13, a14, a11, a15, q1, q2 - - EE.VST.128.IP q0, a2, 16 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_h_w_unaligned_c_n_activation_loop0 - j tie728_s8_conv2d_h_w_unaligned_c_n_loop_end - - # output sar_byte == 8 - tie728_s8_conv2d_h_w_unaligned_c_n_activation_loop8: - mov a15, a3 # reload input_ptr - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a13, tie728_s8_conv2d_h_w_unaligned_c_n_activation_loop8_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a13 - tie728_s8_conv2d_h_w_unaligned_c_n_activation_loop8_no_preload_bias: - tie728_s8_conv2d_h_w_unaligned_c q0, q1, q2, a15, q3, q4, a5, a6, a11, a12, a4, a9, a10, q6 - EE.MOVI.32.A q7, a10, 0 # operation_type - l32i a11, a4, 84 # activation_shift - tie728_s8_conv2d_1_1_unaligned_c_result a10, q0, a8, a13, a14, a11, a15, q1, q2 - - #store to unaligned address - dl_tie728_s8_unaligned_store1 q0, a2 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_h_w_unaligned_c_n_activation_loop8 - j tie728_s8_conv2d_h_w_unaligned_c_n_loop_end - -tie728_s8_conv2d_h_w_unaligned_c_n_loop_end: - - - # handle the n remainder - l32i a7, a4, 140 # n % 16 remainder_n - beqz a7, dl_tie728_s8_unaligned_conv2d_hwcn_end - - l32i a5, a4, 160 - l32i a11, a4, 164 - EE.MOVI.32.Q q6, a5, 1 - EE.MOVI.32.Q q6, a11, 2 - l32i a5, a4, 168 # filter_ptr unaligned - - l32i a11, a4, 84 # activation_shift - l32i a9, a4, 136 # c_remainder: c % 16 - - - bgez a11, tie728_s8_conv2d_h_w_c_unaligned_n_activation - # n_remainder - tie728_s8_conv2d_h_w_c_unaligned_n_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.ACCX - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a13, tie728_s8_conv2d_h_w_c_unaligned_n_loop_no_preload_bias - tie728_s8_conv2d_element_bias a13 - tie728_s8_conv2d_h_w_c_unaligned_n_loop_no_preload_bias: - - # complete one n in ACCX - tie728_s8_conv2d_h_w_c_n_remainder q0, q1, q2, a15, q3, q4, q5, a5, a6, a4, a11, a12, a9, a10, a14, q6 - tie728_s8_conv2d_1_1_n_remainder_result q7, a10, a2, a8, a13, a11, a11, a11, q0 - - addi a7, a7, -1 - bnez a7, tie728_s8_conv2d_h_w_c_unaligned_n_loop - j dl_tie728_s8_unaligned_conv2d_hwcn_end - - tie728_s8_conv2d_h_w_c_unaligned_n_activation: - EE.MOVI.32.Q q6, a7, 0 # store remainder_n in q6[0] - - # n_remainder - tie728_s8_conv2d_h_w_c_unaligned_n_activation_loop: - mov a15, a3 # reload input_ptr - EE.ZERO.ACCX - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a13, tie728_s8_conv2d_h_w_c_unaligned_n_activation_loop_no_preload_bias - tie728_s8_conv2d_element_bias a13 - tie728_s8_conv2d_h_w_c_unaligned_n_activation_loop_no_preload_bias: - - # complete one n in ACCX - tie728_s8_conv2d_h_w_c_n_remainder q0, q1, q2, a15, q3, q4, q5, a5, a6, a4, a11, a12, a9, a10, a7, q6 - - l32i a11, a4, 84 # activation_shift - tie728_s8_conv2d_1_1_n_remainder_result q7, a10, a2, a8, a13, a14, a11, a15, q0 - - EE.MOVI.32.A q6, a7, 0 - addi a7, a7, -1 - EE.MOVI.32.Q q6, a7, 0 # store remainder_n in q6[0] - bnez a7, tie728_s8_conv2d_h_w_c_unaligned_n_activation_loop - - -dl_tie728_s8_unaligned_conv2d_hwcn_end: - - retw diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_depthwise_conv2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_depthwise_conv2d.S deleted file mode 100644 index 94b98307..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_depthwise_conv2d.S +++ /dev/null @@ -1,1638 +0,0 @@ -#include "dl_tie728_s8.S" - - -############################################################################################################################################################ -#### -#### tie728_s8_depthwise_conv2d_33c1 series -#### -############################################################################################################################################################ -.macro tie728_s8_depthwise_conv2d_33s1 input_v0 filter_v0 input_v1 filter_v1 input_v2 filter_v2 input_ptr filter_ptr dilation_x_offset dilation_y_offset next_33s1 - # dilation_x_offset = input_channel_with_padding * dilation_x * sizeof(T) - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) - # next_33s1 = (-(filter_width - 1) * dilation_x * input_channel_with_padding - (filter_height - 1) * dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) + 16 - - # EE.ZERO.QACC - - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v2, \input_ptr, \dilation_y_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v2, \input_ptr, \dilation_y_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v2, \input_ptr, \next_33s1 - - EE.VMULAS.S8.QACC.LD.IP \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset -.endm - - - -.macro tie728_s8_depthwise_conv2d_33s1_last input_v0 filter_v0 input_v1 filter_v1 input_ptr filter_ptr dilation_x_offset dilation_y_offset - # dilation_x_offset = input_channel_with_padding * dilation_x * sizeof(T) - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) - - # EE.ZERO.QACC - - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_y_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_y_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.IP \input_v0, \input_ptr, 0 - - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - # block one cycle here - - EE.VMULAS.S8.QACC \input_v0, \filter_v0 -.endm - - - -.macro tie728_s8_depthwise_conv2d_33c1_load_args args filter_ptr dilation_x_offset dilation_y_offset next_33s1 c_div_x_1 - # dilation_x_offset = input_channel_with_padding * dilation_x * sizeof(T) - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) - # next_33s1 = (-(filter_width - 1) * dilation_x * input_channel_with_padding - (filter_height - 1) * dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) + 16 - - l32i \filter_ptr, \args, 48 - l32i \dilation_x_offset, \args, 124 - l32i \dilation_y_offset, \args, 128 - l32i \next_33s1, \args, 132 - l32i \c_div_x_1, \args, 100 - -.endm - - - - .align 4 - .text - .global dl_tie728_s8_depthwise_conv2d_33c1 - .type dl_tie728_s8_depthwise_conv2d_33c1, @function - # .section .iram1 -dl_tie728_s8_depthwise_conv2d_33c1: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_33s1 - # a9: c_div_x_1 - # a10: mac_shift - # a11: bias_ptr - # a12: - # a13: - # a14: - # a15: - tie728_s8_depthwise_conv2d_33c1_load_args a4, a5, a6, a7, a8, a9 - - l32i a10, a4, 64 # mac shift - l32i a11, a4, 68 # bias - - EE.VLD.128.XP q0, a3, a6 - EE.VLD.128.IP q1, a5, 16 - EE.VLD.128.XP q2, a3, a6 - - blti a10, 0, dl_tie728_s8_depthwise_conv2d_per_channel_33c1 - - -dl_tie728_s8_depthwise_conv2d_per_layer_33c1: - - beqz a11, tie728_s8_depthwise_conv2d_per_layer_33c1_no_bias - - loopgtz a9, tie728_s8_depthwise_conv2d_per_layer_33c1_bias_loop - EE.ZERO.QACC - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s8_vector_round_result q3, a10, a15, q6 - EE.VST.128.IP q3, a2, 16 - tie728_s8_depthwise_conv2d_per_layer_33c1_bias_loop: - - EE.ZERO.QACC - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a3, a5, a6, a7 - tie728_s8_vector_round_result q3, a10, a15, q6 - EE.VST.128.IP q3, a2, 16 - retw - - - tie728_s8_depthwise_conv2d_per_layer_33c1_no_bias: - loopgtz a9, tie728_s8_depthwise_conv2d_per_layer_33c1_no_bias_loop - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s8_vector_round_result q3, a10, a15, q6 - EE.VST.128.IP q3, a2, 16 - tie728_s8_depthwise_conv2d_per_layer_33c1_no_bias_loop: - - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a3, a5, a6, a7 - tie728_s8_vector_round_result q3, a10, a15, q6 - EE.VST.128.IP q3, a2, 16 - retw - - -dl_tie728_s8_depthwise_conv2d_per_channel_33c1: - l32i a10, a4, 104 # filter_channel_factor address - - beqz a11, tie728_s8_depthwise_conv2d_per_channel_33c1_no_bias - - loopgtz a9, tie728_s8_depthwise_conv2d_per_channel_33c1_bias_loop - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - # tie728_s8_conv2d_per_channel_result q3, q4, a10, a15, q5 - # tie728_s8_conv2d_bias q3, q4, a11 - tie728_s8_conv2d_per_channel_with_bias_result q3, q4, a10, a11, a15, q5 - EE.VST.128.IP q3, a2, 16 - tie728_s8_depthwise_conv2d_per_channel_33c1_bias_loop: - - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a3, a5, a6, a7 - # tie728_s8_conv2d_per_channel_result q3, q4, a10, a15, q5 - # tie728_s8_conv2d_bias q3, q4, a11 - tie728_s8_conv2d_per_channel_with_bias_result q3, q4, a10, a11, a15, q5 - EE.VST.128.IP q3, a2, 16 - retw - - - tie728_s8_depthwise_conv2d_per_channel_33c1_no_bias: - loopgtz a9, tie728_s8_depthwise_conv2d_per_channel_33c1_no_bias_loop - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s8_conv2d_per_channel_result q3, q4, a10, a15, q5 - EE.VST.128.IP q3, a2, 16 - tie728_s8_depthwise_conv2d_per_channel_33c1_no_bias_loop: - - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a3, a5, a6, a7 - tie728_s8_conv2d_per_channel_result q3, q4, a10, a15, q5 - EE.VST.128.IP q3, a2, 16 - retw - - - - .align 4 - .text - .global dl_tie728_s8_depthwise_conv2d_33c1_relu - .type dl_tie728_s8_depthwise_conv2d_33c1_relu, @function - # .section .iram1 -dl_tie728_s8_depthwise_conv2d_33c1_relu: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_33s1 - # a9: c_div_x_1 - # a10: mac_shift - # a11: bias_ptr - # a12: activation_alpha/_address - # a13: activation_shift - # a14: - # a15: - tie728_s8_depthwise_conv2d_33c1_load_args a4, a5, a6, a7, a8, a9 - - l32i a10, a4, 64 # mac shift - l32i a11, a4, 68 # bias - l32i a12, a4, 76 # activation_alpha - l32i a13, a4, 84 # activation_shift - - EE.VLD.128.XP q0, a3, a6 - EE.VLD.128.IP q1, a5, 16 - EE.VLD.128.XP q2, a3, a6 - - blti a10, 0, dl_tie728_s8_depthwise_conv2d_per_channel_33c1_relu - - -dl_tie728_s8_depthwise_conv2d_per_layer_33c1_relu: - - beqz a11, tie728_s8_depthwise_conv2d_per_layer_33c1_no_bias_relu - - loopgtz a9, tie728_s8_depthwise_conv2d_per_layer_33c1_bias_relu_loop - EE.ZERO.QACC - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s8_vector_round_result q3, a10, a15, q6 - tie728_s8_conv2d_relu q3, a12, a13 - EE.VST.128.IP q3, a2, 16 - tie728_s8_depthwise_conv2d_per_layer_33c1_bias_relu_loop: - - EE.ZERO.QACC - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a3, a5, a6, a7 - tie728_s8_vector_round_result q3, a10, a15, q6 - tie728_s8_conv2d_relu q3, a12, a13 - EE.VST.128.IP q3, a2, 16 - retw - - - tie728_s8_depthwise_conv2d_per_layer_33c1_no_bias_relu: - loopgtz a9, tie728_s8_depthwise_conv2d_per_layer_33c1_no_bias_relu_loop - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s8_vector_round_result q3, a10, a15, q6 - tie728_s8_conv2d_relu q3, a12, a13 - EE.VST.128.IP q3, a2, 16 - tie728_s8_depthwise_conv2d_per_layer_33c1_no_bias_relu_loop: - - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a3, a5, a6, a7 - tie728_s8_vector_round_result q3, a10, a15, q6 - tie728_s8_conv2d_relu q3, a12, a13 - EE.VST.128.IP q3, a2, 16 - retw - - -dl_tie728_s8_depthwise_conv2d_per_channel_33c1_relu: - l32i a10, a4, 104 # filter_channel_factor address - - beqz a11, tie728_s8_depthwise_conv2d_per_channel_33c1_no_bias_relu - - loopgtz a9, tie728_s8_depthwise_conv2d_per_channel_33c1_bias_relu_loop - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - # tie728_s8_conv2d_per_channel_result q3, q4, a10, a15, q5 - # tie728_s8_conv2d_bias_relu q3, q4, a11, a12, a13 - tie728_s8_conv2d_per_channel_with_bias_result q3, q4, a10, a11, a15, q5 - tie728_s8_conv2d_relu q3, a12, a13 - EE.VST.128.IP q3, a2, 16 - tie728_s8_depthwise_conv2d_per_channel_33c1_bias_relu_loop: - - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a3, a5, a6, a7 - # tie728_s8_conv2d_per_channel_result q3, q4, a10, a15, q5 - # tie728_s8_conv2d_bias_relu q3, q4, a11, a12, a13 - tie728_s8_conv2d_per_channel_with_bias_result q3, q4, a10, a11, a15, q5 - tie728_s8_conv2d_relu q3, a12, a13 - EE.VST.128.IP q3, a2, 16 - retw - - - tie728_s8_depthwise_conv2d_per_channel_33c1_no_bias_relu: - loopgtz a9, tie728_s8_depthwise_conv2d_per_channel_33c1_no_bias_relu_loop - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s8_conv2d_per_channel_result q3, q4, a10, a15, q5 - tie728_s8_conv2d_relu q3, a12, a13 - EE.VST.128.IP q3, a2, 16 - tie728_s8_depthwise_conv2d_per_channel_33c1_no_bias_relu_loop: - - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a3, a5, a6, a7 - tie728_s8_conv2d_per_channel_result q3, q4, a10, a15, q5 - tie728_s8_conv2d_relu q3, a12, a13 - EE.VST.128.IP q3, a2, 16 - retw - - - - .align 4 - .text - .global dl_tie728_s8_depthwise_conv2d_33c1_prelu - .type dl_tie728_s8_depthwise_conv2d_33c1_prelu, @function - # .section .iram1 -dl_tie728_s8_depthwise_conv2d_33c1_prelu: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_33s1 - # a9: c_div_x_1 - # a10: mac_shift - # a11: bias_ptr - # a12: activation_alpha/_address - # a13: activation_shift - # a14: - # a15: - tie728_s8_depthwise_conv2d_33c1_load_args a4, a5, a6, a7, a8, a9 - - l32i a10, a4, 64 # mac shift - l32i a11, a4, 68 # bias - l32i a12, a4, 80 # activation_alpha_ptr - l32i a13, a4, 84 # activation_shift - - EE.VLD.128.XP q0, a3, a6 - EE.VLD.128.IP q1, a5, 16 - EE.VLD.128.XP q2, a3, a6 - - blti a10, 0, dl_tie728_s8_depthwise_conv2d_per_channel_33c1_prelu - - -dl_tie728_s8_depthwise_conv2d_per_layer_33c1_prelu: - - beqz a11, tie728_s8_depthwise_conv2d_per_layer_33c1_no_bias_prelu - - loopgtz a9, tie728_s8_depthwise_conv2d_per_layer_33c1_bias_prelu_loop - EE.ZERO.QACC - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s8_vector_round_result q3, a10, a15, q6 - tie728_s8_conv2d_prelu q3, q5, a12, a13 - EE.VST.128.IP q3, a2, 16 - tie728_s8_depthwise_conv2d_per_layer_33c1_bias_prelu_loop: - - EE.ZERO.QACC - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a3, a5, a6, a7 - tie728_s8_vector_round_result q3, a10, a15, q6 - tie728_s8_conv2d_prelu q3, q5, a12, a13 - EE.VST.128.IP q3, a2, 16 - retw - - - tie728_s8_depthwise_conv2d_per_layer_33c1_no_bias_prelu: - loopgtz a9, tie728_s8_depthwise_conv2d_per_layer_33c1_no_bias_prelu_loop - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s8_vector_round_result q3, a10, a15, q6 - tie728_s8_conv2d_prelu q3, q5, a12, a13 - EE.VST.128.IP q3, a2, 16 - tie728_s8_depthwise_conv2d_per_layer_33c1_no_bias_prelu_loop: - - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a3, a5, a6, a7 - tie728_s8_vector_round_result q3, a10, a15, q6 - tie728_s8_conv2d_prelu q3, q5, a12, a13 - EE.VST.128.IP q3, a2, 16 - retw - - -dl_tie728_s8_depthwise_conv2d_per_channel_33c1_prelu: - l32i a10, a4, 104 # filter_channel_factor address - - beqz a11, tie728_s8_depthwise_conv2d_per_channel_33c1_no_bias_prelu - - loopgtz a9, tie728_s8_depthwise_conv2d_per_channel_33c1_bias_prelu_loop - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - # tie728_s8_conv2d_per_channel_result q3, q4, a10, a15, q5 - # tie728_s8_conv2d_bias_prelu q3, q4, a11, q5, a12, a13 - tie728_s8_conv2d_per_channel_with_bias_result q3, q4, a10, a11, a15, q5 - tie728_s8_conv2d_prelu q3, q5, a12, a13 - EE.VST.128.IP q3, a2, 16 - tie728_s8_depthwise_conv2d_per_channel_33c1_bias_prelu_loop: - - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a3, a5, a6, a7 - # tie728_s8_conv2d_per_channel_result q3, q4, a10, a15, q5 - # tie728_s8_conv2d_bias_prelu q3, q4, a11, q5, a12, a13 - tie728_s8_conv2d_per_channel_with_bias_result q3, q4, a10, a11, a15, q5 - tie728_s8_conv2d_prelu q3, q5, a12, a13 - EE.VST.128.IP q3, a2, 16 - retw - - - tie728_s8_depthwise_conv2d_per_channel_33c1_no_bias_prelu: - loopgtz a9, tie728_s8_depthwise_conv2d_per_channel_33c1_no_bias_prelu_loop - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s8_conv2d_per_channel_result q3, q4, a10, a15, q5 - tie728_s8_conv2d_prelu q3, q4, a12, a13 - EE.VST.128.IP q3, a2, 16 - tie728_s8_depthwise_conv2d_per_channel_33c1_no_bias_prelu_loop: - - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_33s1_last q0, q1, q2, q3, a3, a5, a6, a7 - tie728_s8_conv2d_per_channel_result q3, q4, a10, a15, q5 - tie728_s8_conv2d_prelu q3, q4, a12, a13 - EE.VST.128.IP q3, a2, 16 - retw - - - - - - - - - - - -.macro tie728_s8_unaligned_depthwise_conv2d_33s1 input_v0 input_v1 input_v2 input_back_aligned input_ptr filter_v0 filter_v1 filter_v2 filter_ptr dilation_x_offset dilation_y_offset next_33s1 - # dilation_x_offset = input_channel_with_padding * dilation_x * sizeof(T) - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) - # next_33s1 = (-(filter_width - 1) * dilation_x * input_channel_with_padding - (filter_height - 1) * dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) + 16 - - # EE.ZERO.QACC - - # EE.LD.128.USAR.IP \input_v2, \input_ptr, 16 - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_y_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v2, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - EE.SRC.Q.LD.IP \input_v2, \input_ptr, 16, \input_v1, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_y_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v2, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - EE.SRC.Q.LD.IP \input_v2, \input_ptr, 16, \input_v1, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \next_33s1 - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v2, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v2, \filter_ptr, 16, \input_v1, \filter_v1 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 - EE.SRC.Q.LD.IP \input_v2, \input_ptr, 16, \input_v1, \input_back_aligned -.endm - - -.macro tie728_s8_unaligned_depthwise_conv2d_33s1_last input_v0 input_v1 input_v2 input_back_aligned input_ptr filter_v0 filter_v1 filter_ptr dilation_x_offset dilation_y_offset next_33s1 - # dilation_x_offset = input_channel_with_padding * dilation_x * sizeof(T) - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) - - # EE.ZERO.QACC - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_y_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v2, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v2, \filter_v0 - EE.SRC.Q.LD.IP \input_v2, \input_ptr, 16, \input_v1, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_y_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v0, \filter_v1 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v2, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v1, \filter_v0 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v2, \filter_v1 - EE.SRC.Q.LD.IP \input_v2, \input_ptr, 16, \input_v1, \input_back_aligned - - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \next_33s1 - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q \input_v2, \input_v2, \input_back_aligned - - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - - EE.VMULAS.S8.QACC \input_v2, \filter_v0 -.endm - - -.macro tie728_s8_depthwise_conv2d_unaligned_c_slice_updatex input_v0 input_front_aligned input_back_aligned input_ptr filter_v0 filter_front_aligned filter_back_aligned filter_ptr dilation_x_offset remainder_c - EE.LD.128.USAR.XP \input_front_aligned, \input_ptr, \remainder_c - # rur.sar_byte \input_sar #input sar - # EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - # wur.sar_byte \input_sar #input sar - EE.VLD.128.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.SRC.Q \input_v0, \input_front_aligned, \input_back_aligned - - EE.LD.128.USAR.XP \filter_front_aligned, \filter_ptr, \remainder_c - # rur.sar_byte \filter_sar #filter sar - # EE.LD.128.USAR.IP \filter_back_aligned, \filter_ptr, 0 - # wur.sar_byte \filter_sar #filter sar - EE.VLD.128.IP \filter_back_aligned, \filter_ptr, 0 - EE.SRC.Q \filter_v0, \filter_front_aligned, \filter_back_aligned - - EE.VMULAS.S8.QACC \input_v0, \filter_v0 -.endm - - -.macro tie728_s8_depthwise_conv2d_unaligned_c_slice_updatey input_v0 input_front_aligned input_back_aligned input_ptr filter_v0 filter_front_aligned filter_back_aligned filter_ptr dilation_y_offset remainder_c - # input remainder is in one 128bit then input_back_aligned = input_front_aligned - EE.LD.128.USAR.XP \input_front_aligned, \input_ptr, \remainder_c - # rur.sar_byte \input_sar #input sar - # EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_y_offset - # wur.sar_byte \input_sar #input sar - EE.VLD.128.XP \input_back_aligned, \input_ptr, \dilation_y_offset - EE.SRC.Q \input_v0, \input_front_aligned, \input_back_aligned - - EE.LD.128.USAR.XP \filter_front_aligned, \filter_ptr, \remainder_c - # rur.sar_byte \filter_sar #filter sar - # EE.LD.128.USAR.IP \filter_back_aligned, \filter_ptr, 0 - # wur.sar_byte \filter_sar #filter sar - EE.VLD.128.IP \filter_back_aligned, \filter_ptr, 0 - EE.SRC.Q \filter_v0, \filter_front_aligned, \filter_back_aligned - - EE.VMULAS.S8.QACC \input_v0, \filter_v0 -.endm - - -.macro tie728_s8_depthwise_conv2d_33s1_c_remainder input_v0 input_front_aligned input_back_aligned filter_v0 filter_front_aligned filter_back_aligned input_ptr filter_ptr dilation_x_offset dilation_y_offset remainder_c - # dilation_x_offset = input_channel_with_padding * dilation_x * remainder_c - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * remainder_c - - # EE.ZERO.QACC - - tie728_s8_depthwise_conv2d_unaligned_c_slice_updatex \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \dilation_x_offset, \remainder_c - tie728_s8_depthwise_conv2d_unaligned_c_slice_updatex \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \dilation_x_offset, \remainder_c - tie728_s8_depthwise_conv2d_unaligned_c_slice_updatey \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \dilation_y_offset, \remainder_c - - tie728_s8_depthwise_conv2d_unaligned_c_slice_updatex \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \dilation_x_offset, \remainder_c - tie728_s8_depthwise_conv2d_unaligned_c_slice_updatex \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \dilation_x_offset, \remainder_c - tie728_s8_depthwise_conv2d_unaligned_c_slice_updatey \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \dilation_y_offset, \remainder_c - - tie728_s8_depthwise_conv2d_unaligned_c_slice_updatex \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \dilation_x_offset, \remainder_c - tie728_s8_depthwise_conv2d_unaligned_c_slice_updatex \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \dilation_x_offset, \remainder_c - - EE.LD.128.USAR.XP \input_front_aligned, \input_ptr, \remainder_c - # rur.sar_byte \input_sar #input sar - # EE.LD.128.USAR.IP \input_back_aligned, \input_ptr, 0 - # wur.sar_byte \input_sar #input sar - EE.VLD.128.IP \input_back_aligned, \input_ptr, 0 - EE.SRC.Q \input_v0, \input_front_aligned, \input_back_aligned - - EE.LD.128.USAR.XP \filter_front_aligned, \filter_ptr, \remainder_c - # rur.sar_byte \filter_sar #filter sar - # EE.LD.128.USAR.IP \filter_back_aligned, \filter_ptr, 0 - # wur.sar_byte \filter_sar #filter sar - EE.VLD.128.IP \filter_back_aligned, \filter_ptr, 0 - EE.SRC.Q \filter_v0, \filter_front_aligned, \filter_back_aligned - - EE.VMULAS.S8.QACC \input_v0, \filter_v0 - -.endm - - - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_depthwise_conv2d_33c1 - .type dl_tie728_s8_unaligned_depthwise_conv2d_33c1, @function - # .section .iram1 -dl_tie728_s8_unaligned_depthwise_conv2d_33c1: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_33s1 / remainder_c - # a9: c_div_x_1 - # a10: mac_shift - # a11: bias_ptr - # a12: activation_alpha/_address - # a13: activation_shift - # a14: - # a15: operation_type / - tie728_s8_depthwise_conv2d_33c1_load_args a4, a5, a6, a7, a8, a9 - - - l32i a10, a4, 64 # mac_shift - l32i a11, a4, 68 # bias - #l32i a12, a4, 76 # activation_alpha - l32i a13, a4, 84 # activation_shift - l32i a12, a4, 80 # activation_alpha_ptr - - tie728_s8_unaligned_conv2d_operation_type a15, a10, a11, a13, a12, a4 - - addi a6, a6, -16 - addi a7, a7, -16 - - l32i a14, a4, 4 - blti a14, 16, tie728_s8_unaligned_depthwise_conv2d_33c1_c_loop_end # input_channel < 16 - - EE.LD.128.USAR.IP q2, a3, 16 - EE.LD.128.USAR.XP q3, a3, a6 - EE.VLD.128.IP q4, a5, 16 # filter_v0 - EE.SRC.Q.QUP q0, q2, q3 # input_v0 - - - EE.LD.128.USAR.IP q2, a3, 16 - EE.LD.128.USAR.XP q3, a3, a6 - addi a8, a8, -16 - EE.SRC.Q.QUP q1, q2, q3 # input_v1 - - EE.LD.128.USAR.IP q2, a3, 16 # input_v2 - - EE.LD.128.USAR.IP q6, a2, 0 #get output_ptr sar_byte - rur.sar_byte a14 - - beqi a14, 0, tie728_s8_unaligned_depthwise_conv2d_33c1_0 - beqi a14, 8, tie728_s8_unaligned_depthwise_conv2d_33c1_8 - - -# output sar_byte != 0 && != 8 - beqi a9, 0, tie728_s8_unaligned_depthwise_conv2d_33c1_loop_last - - - - tie728_s8_unaligned_depthwise_conv2d_33c1_loop: - # loopgtz a9, tie728_s8_unaligned_depthwise_conv2d_33c1_loop # Internal error in istack_push_space - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_unaligned_depthwise_conv2d_33c1_loop_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_unaligned_depthwise_conv2d_33c1_loop_no_preload_bias: - - tie728_s8_unaligned_depthwise_conv2d_33s1 q0, q1, q2, q3, a3, q4, q5, q6, a5, a6, a7, a8 - tie728_s8_conv2d_1_1_unaligned_c_result a15, q3, a10, a11, a12, a13, a14, q5, q6 - # tie728_s8_conv2d_per_layer_result q3, a10 - - #store to unaligned address - dl_tie728_s8_unaligned_store0 q3, a2, a14 - - addi a9, a9, -1 - bnez a9, tie728_s8_unaligned_depthwise_conv2d_33c1_loop - -tie728_s8_unaligned_depthwise_conv2d_33c1_loop_last: - - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_unaligned_depthwise_conv2d_33c1_loop_last_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_unaligned_depthwise_conv2d_33c1_loop_last_no_preload_bias: - - tie728_s8_unaligned_depthwise_conv2d_33s1_last q0, q1, q2, q3, a3, q4, q5, a5, a6, a7, a8 - tie728_s8_conv2d_1_1_unaligned_c_result a15, q3, a10, a11, a12, a13, a14, q5, q6 - - #store to unaligned address - dl_tie728_s8_unaligned_store0 q3, a2, a14 - - j tie728_s8_unaligned_depthwise_conv2d_33c1_c_loop_end - - -# output sar_byte == 0 -tie728_s8_unaligned_depthwise_conv2d_33c1_0: - - beqi a9, 0, tie728_s8_unaligned_depthwise_conv2d_33c1_loop0_last - - tie728_s8_unaligned_depthwise_conv2d_33c1_loop0: - # loopgtz a9, tie728_s8_unaligned_depthwise_conv2d_33c1_loop0 - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_unaligned_depthwise_conv2d_33c1_loop0_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_unaligned_depthwise_conv2d_33c1_loop0_no_preload_bias: - - tie728_s8_unaligned_depthwise_conv2d_33s1 q0, q1, q2, q3, a3, q4, q5, q6, a5, a6, a7, a8 - tie728_s8_conv2d_1_1_unaligned_c_result a15, q3, a10, a11, a12, a13, a14, q5, q6 - - EE.VST.128.IP q3, a2, 16 - # tie728_s8_unaligned_depthwise_conv2d_33c1_loop0: - addi a9, a9, -1 - bnez a9, tie728_s8_unaligned_depthwise_conv2d_33c1_loop0 - -tie728_s8_unaligned_depthwise_conv2d_33c1_loop0_last: - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_unaligned_depthwise_conv2d_33c1_loop0_last_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_unaligned_depthwise_conv2d_33c1_loop0_last_no_preload_bias: - - tie728_s8_unaligned_depthwise_conv2d_33s1_last q0, q1, q2, q3, a3, q4, q5, a5, a6, a7, a8 - tie728_s8_conv2d_1_1_unaligned_c_result a15, q3, a10, a11, a12, a13, a14, q5, q6 - EE.VST.128.IP q3, a2, 16 - - j tie728_s8_unaligned_depthwise_conv2d_33c1_c_loop_end - -# output sar_byte == 8 -tie728_s8_unaligned_depthwise_conv2d_33c1_8: - - beqi a9, 0, tie728_s8_unaligned_depthwise_conv2d_33c1_loop8_last - tie728_s8_unaligned_depthwise_conv2d_33c1_loop8: - - # loopgtz a9, tie728_s8_unaligned_depthwise_conv2d_33c1_loop8 - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_unaligned_depthwise_conv2d_33c1_loop8_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_unaligned_depthwise_conv2d_33c1_loop8_no_preload_bias: - - tie728_s8_unaligned_depthwise_conv2d_33s1 q0, q1, q2, q3, a3, q4, q5, q6, a5, a6, a7, a8 - tie728_s8_conv2d_1_1_unaligned_c_result a15, q3, a10, a11, a12, a13, a14, q5, q6 - - #store to unaligned address - dl_tie728_s8_unaligned_store1 q3, a2 - addi a9, a9, -1 - # tie728_s8_unaligned_depthwise_conv2d_33c1_loop8: - bnez a9, tie728_s8_unaligned_depthwise_conv2d_33c1_loop8 - -tie728_s8_unaligned_depthwise_conv2d_33c1_loop8_last: - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_unaligned_depthwise_conv2d_33c1_loop8_last_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_unaligned_depthwise_conv2d_33c1_loop8_last_no_preload_bias: - - tie728_s8_unaligned_depthwise_conv2d_33s1_last q0, q1, q2, q3, a3, q4, q5, a5, a6, a7, a8 - tie728_s8_conv2d_1_1_unaligned_c_result a15, q3, a10, a11, a12, a13, a14, q5, q6 - dl_tie728_s8_unaligned_store1 q3, a2 - - -tie728_s8_unaligned_depthwise_conv2d_33c1_c_loop_end: - - l32i a8, a4, 136 # c_remainder - - beqz a8, dl_tie728_s8_unaligned_depthwise_conv2d_33c1_end - # mov a9, a15 #operation_type - - addi a6, a6, 16 - addi a7, a7, 16 - - sub a6, a6, a8 - sub a7, a7, a8 - - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a11, tie728_s8_unaligned_depthwise_conv2d_33c1_c_loop_end_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a11 - tie728_s8_unaligned_depthwise_conv2d_33c1_c_loop_end_no_preload_bias: - - tie728_s8_depthwise_conv2d_33s1_c_remainder q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8 - tie728_s8_conv2d_1_1_unaligned_c_result a15, q3, a10, a11, a12, a13, a14, q4, q5 - - # store low remainder_c part - dl_tie728_s8_store_remainder q3, a10, a11, a12, a13, a2, a8 - -dl_tie728_s8_unaligned_depthwise_conv2d_33c1_end: - retw - - - - - - -############################################################################################################################################################ -#### -#### tie728_s8_depthwise_conv2d_hwc1 series -#### -############################################################################################################################################################ - - -.macro tie728_s8_depthwise_conv2d_1ws1 input_v0 input_v1 input_v2 filter_v0 filter_v1 filter_v2 input_ptr filter_ptr dilation_x_offset dilation_y_offset filter_h filter_w filter_w_rs1_1 filter_y_offset - loopgtz \filter_w_rs1_1, 1f - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset - 1: - bbci \filter_w, 0, 2f - # three 8-input-element left - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v2, \input_ptr, \dilation_y_offset - - EE.VMULAS.S8.QACC.LD.XP \filter_v2, \filter_ptr, \filter_y_offset, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v2, \filter_v2 # block one cyle here - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset - j 3f -2: # two 8-input-element left - EE.VMULAS.S8.QACC.LD.XP \filter_v1, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - sub \input_ptr, \input_ptr, \dilation_x_offset - add \input_ptr, \input_ptr, \dilation_y_offset - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 # block one cyle here - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset -3: -.endm - - - - -.macro tie728_s8_depthwise_conv2d_1ws1_last input_v0 input_v1 filter_v0 filter_v1 input_ptr filter_ptr dilation_x_offset dilation_y_offset filter_h filter_w filter_w_rs1_1 next_hws1 filter_y_offset - loopgtz \filter_w_rs1_1, 4f - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v1 - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset - 4: - bbci \filter_w, 0, 5f - # three 8-input-element left - EE.VMULAS.S8.QACC.LD.IP \filter_v1, \filter_ptr, 16, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v0, \input_ptr, \next_hws1 - - EE.VMULAS.S8.QACC.LD.XP \filter_v0, \filter_ptr, \filter_y_offset, \input_v1, \filter_v1 - # block one cyle here - EE.VMULAS.S8.QACC \input_v0, \filter_v0 - j 6f -5: # two 8-input-element left - EE.VMULAS.S8.QACC.LD.XP \filter_v1, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - sub \input_ptr, \input_ptr, \dilation_x_offset - add \input_ptr, \input_ptr, \next_hws1 - EE.VMULAS.S8.QACC \input_v1, \filter_v1 -6: - -.endm - - - -.macro tie728_s8_depthwise_conv2d_hws1 input_v0 input_v1 input_v2 filter_v0 filter_v1 filter_v2 input_ptr filter_ptr dilation_x_offset dilation_y_offset next_hws1 filter_h filter_w filter_w_rs1_1 args filter_offset_q filter_y_offset - # dilation_x_offset = input_channel_with_padding * dilation_x * sizeof(T) - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) - # next_hws1 = (-(filter_width - 1) * dilation_x * input_channel_with_padding - (filter_height - 1) * dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) + 16 - # filter_w_rs1_1 - - - # EE.ZERO.QACC - - l32i \filter_h, \args, 52 # filter_height - - blti \filter_w, 2, 9f - EE.VLD.128.IP \filter_v0, \filter_ptr, 16 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_x_offset - EE.VLD.128.XP \input_v1, \input_ptr, \dilation_x_offset - blti \filter_h, 2, 8f - 7: - tie728_s8_depthwise_conv2d_1ws1 \input_v0, \input_v1, \input_v2, \filter_v0, \filter_v1, \filter_v2, \input_ptr, \filter_ptr, \dilation_x_offset, \dilation_y_offset, \filter_h, \filter_w, \filter_w_rs1_1, \filter_y_offset - addi \filter_h, \filter_h, -1 - bgei \filter_h, 2, 7b - 8: # last y - tie728_s8_depthwise_conv2d_1ws1_last \input_v0, \input_v1, \filter_v0, \filter_v1, \input_ptr, \filter_ptr, \dilation_x_offset, \dilation_y_offset, \filter_h, \filter_w, \filter_w_rs1_1, \next_hws1, \filter_y_offset - j 12f - - - # filter_w == 1 - 9: - - EE.VLD.128.XP \filter_v0, \filter_ptr, \filter_y_offset - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_y_offset - blti \filter_h, 2, 11f - 10: - EE.VMULAS.S8.QACC.LD.XP \filter_v0, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - EE.VLD.128.XP \input_v0, \input_ptr, \dilation_y_offset - addi \filter_h, \filter_h, -1 - bgei \filter_h, 2, 10b - 11: # last y - EE.VMULAS.S8.QACC \input_v0, \filter_v0 - sub \input_ptr, \input_ptr, \dilation_y_offset - add \input_ptr, \input_ptr, \next_hws1 - -12: - EE.MOVI.32.A \filter_offset_q, \filter_h, 2 - add \filter_ptr, \filter_ptr, \filter_h -.endm - - - - -.macro tie728_s8_depthwise_conv2d_hwc1_load_args args filter_ptr dilation_x_offset dilation_y_offset next_hws1 c_div_x_1 filter_w filter_w_rs1_1 - tie728_s8_depthwise_conv2d_33c1_load_args \args, \filter_ptr, \dilation_x_offset, \dilation_y_offset, \next_hws1, \c_div_x_1 - l32i \filter_w, \args, 56 - l32i \filter_w_rs1_1, \args, 148 - -.endm - - - - - .align 4 - .text - .global dl_tie728_s8_depthwise_conv2d_hwc1 - .type dl_tie728_s8_depthwise_conv2d_hwc1, @function - # .section .iram1 -dl_tie728_s8_depthwise_conv2d_hwc1: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_hws1 - # a9: c_div_x_1 - # a10: mac_shift - # a11: filter_h - # a12: filter_w - # a13: filter_w_rs1_1 - # a14: bias_ptr - # a15: - - l32i a15, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a11, 2 - - tie728_s8_depthwise_conv2d_hwc1_load_args a4, a5, a6, a7, a8, a9, a12, a13 - - l32i a10, a4, 64 # mac shift - l32i a14, a4, 68 # bias - - blti a10, 0, dl_tie728_s8_depthwise_conv2d_per_channel_hwc1 - - -dl_tie728_s8_depthwise_conv2d_per_layer_hwc1: - - beqz a14, tie728_s8_depthwise_conv2d_per_layer_hwc1_no_bias - - tie728_s8_depthwise_conv2d_per_layer_hwc1_bias_loop: - EE.ZERO.QACC - tie728_s8_conv2d_128b_vector_bias a14 - tie728_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a15 - tie728_s8_vector_round_result q3, a10, a11, q6 - EE.VST.128.IP q3, a2, 16 - addi a9, a9, -1 - bgez a9, tie728_s8_depthwise_conv2d_per_layer_hwc1_bias_loop - retw - - - tie728_s8_depthwise_conv2d_per_layer_hwc1_no_bias: - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a15 - tie728_s8_vector_round_result q3, a10, a11, q6 - EE.VST.128.IP q3, a2, 16 - addi a9, a9, -1 - bgez a9, tie728_s8_depthwise_conv2d_per_layer_hwc1_no_bias - retw - - -dl_tie728_s8_depthwise_conv2d_per_channel_hwc1: - l32i a10, a4, 104 # filter_channel_factor address - - beqz a14, tie728_s8_depthwise_conv2d_per_channel_hwc1_no_bias - - tie728_s8_depthwise_conv2d_per_channel_hwc1_bias_loop: - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a15 - # tie728_s8_conv2d_per_channel_result q3, q4, a10, a11, q5 - # tie728_s8_conv2d_bias q3, q4, a14 - tie728_s8_conv2d_per_channel_with_bias_result q3, q4, a10, a14, a11, q5 - EE.VST.128.IP q3, a2, 16 - addi a9, a9, -1 - bgez a9, tie728_s8_depthwise_conv2d_per_channel_hwc1_bias_loop - retw - - - tie728_s8_depthwise_conv2d_per_channel_hwc1_no_bias: - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a15 - tie728_s8_conv2d_per_channel_result q3, q4, a10, a11, q5 - EE.VST.128.IP q3, a2, 16 - addi a9, a9, -1 - bgez a9, tie728_s8_depthwise_conv2d_per_channel_hwc1_no_bias - retw - - - - - - .align 4 - .text - .global dl_tie728_s8_depthwise_conv2d_hwc1_relu - .type dl_tie728_s8_depthwise_conv2d_hwc1_relu, @function - # .section .iram1 -dl_tie728_s8_depthwise_conv2d_hwc1_relu: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_hws1 - # a9: c_div_x_1 - # a10: mac_shift - # a11: filter_h - # a12: filter_w - # a13: filter_w_rs1_1 - # a14: bias_ptr - # a15: activation_alpha - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - tie728_s8_depthwise_conv2d_hwc1_load_args a4, a5, a6, a7, a8, a9, a12, a13 - - l32i a10, a4, 64 # mac shift - l32i a15, a4, 76 # activation_alpha - l32i a14, a4, 68 # bias - - EE.MOVI.32.Q q7, a15, 3 - - blti a10, 0, dl_tie728_s8_depthwise_conv2d_per_channel_hwc1_relu - - -dl_tie728_s8_depthwise_conv2d_per_layer_hwc1_relu: - - beqz a14, tie728_s8_depthwise_conv2d_per_layer_hwc1_no_bias_relu - - tie728_s8_depthwise_conv2d_per_layer_hwc1_bias_relu_loop: - EE.MOVI.32.A q7, a15, 1 - EE.ZERO.QACC - tie728_s8_conv2d_128b_vector_bias a14 - tie728_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a15 - tie728_s8_vector_round_result q3, a10, a11, q6 - l32i a11, a4, 84 # activation_shift - EE.MOVI.32.A q7, a15, 3 - tie728_s8_conv2d_relu q3, a15, a11 - EE.VST.128.IP q3, a2, 16 - addi a9, a9, -1 - bgez a9, tie728_s8_depthwise_conv2d_per_layer_hwc1_bias_relu_loop - retw - - - tie728_s8_depthwise_conv2d_per_layer_hwc1_no_bias_relu: - l32i a14, a4, 84 # activation_shift - tie728_s8_depthwise_conv2d_per_layer_hwc1_no_bias_relu_loop: - EE.MOVI.32.A q7, a15, 1 - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a15 - tie728_s8_vector_round_result q3, a10, a11, q6 - EE.MOVI.32.A q7, a15, 3 - tie728_s8_conv2d_relu q3, a15, a14 - EE.VST.128.IP q3, a2, 16 - addi a9, a9, -1 - bgez a9, tie728_s8_depthwise_conv2d_per_layer_hwc1_no_bias_relu_loop - retw - - -dl_tie728_s8_depthwise_conv2d_per_channel_hwc1_relu: - l32i a10, a4, 104 # filter_channel_factor address - - beqz a14, tie728_s8_depthwise_conv2d_per_channel_hwc1_no_bias_relu - - tie728_s8_depthwise_conv2d_per_channel_hwc1_bias_relu_loop: - EE.MOVI.32.A q7, a15, 1 - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a15 - # tie728_s8_conv2d_per_channel_result q3, q4, a10, a11, q5 - # l32i a11, a4, 84 # activation_shift - # EE.MOVI.32.A q7, a15, 3 - # tie728_s8_conv2d_bias_relu q3, q4, a14, a15, a11 - tie728_s8_conv2d_per_channel_with_bias_result q3, q4, a10, a14, a11, q5 - l32i a11, a4, 84 # activation_shift - EE.MOVI.32.A q7, a15, 3 - tie728_s8_conv2d_relu q3, a15, a11 - EE.VST.128.IP q3, a2, 16 - addi a9, a9, -1 - bgez a9, tie728_s8_depthwise_conv2d_per_channel_hwc1_bias_relu_loop - retw - - tie728_s8_depthwise_conv2d_per_channel_hwc1_no_bias_relu: - l32i a14, a4, 84 # activation_shift - tie728_s8_depthwise_conv2d_per_channel_hwc1_no_bias_relu_loop: - EE.MOVI.32.A q7, a15, 1 - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a15 - tie728_s8_conv2d_per_channel_result q3, q4, a10, a11, q5 - EE.MOVI.32.A q7, a15, 3 - tie728_s8_conv2d_relu q3, a15, a14 - EE.VST.128.IP q3, a2, 16 - addi a9, a9, -1 - bgez a9, tie728_s8_depthwise_conv2d_per_channel_hwc1_no_bias_relu_loop - retw - - - - - - .align 4 - .text - .global dl_tie728_s8_depthwise_conv2d_hwc1_prelu - .type dl_tie728_s8_depthwise_conv2d_hwc1_prelu, @function - # .section .iram1 -dl_tie728_s8_depthwise_conv2d_hwc1_prelu: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_hws1 - # a9: c_div_x_1 - # a10: mac_shift - # a11: filter_h - # a12: filter_w - # a13: filter_w_rs1_1 - # a14: bias_ptr - # a15: activation_alpha_ptr - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - tie728_s8_depthwise_conv2d_hwc1_load_args a4, a5, a6, a7, a8, a9, a12, a13 - - l32i a10, a4, 64 # mac shift - l32i a15, a4, 80 # activation_alpha_ptr - l32i a14, a4, 68 # bias - EE.MOVI.32.Q q7, a15, 3 - - - blti a10, 0, dl_tie728_s8_depthwise_conv2d_per_channel_hwc1_prelu - - -dl_tie728_s8_depthwise_conv2d_per_layer_hwc1_prelu: - - beqz a14, tie728_s8_depthwise_conv2d_per_layer_hwc1_no_bias_prelu - - tie728_s8_depthwise_conv2d_per_layer_hwc1_bias_prelu_loop: - EE.MOVI.32.A q7, a15, 1 - EE.ZERO.QACC - tie728_s8_conv2d_128b_vector_bias a14 - tie728_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a15 - tie728_s8_vector_round_result q3, a10, a11, q6 - l32i a11, a4, 84 # activation_shift - EE.MOVI.32.A q7, a15, 3 - tie728_s8_conv2d_prelu q3, q5, a15, a11 - EE.MOVI.32.Q q7, a15, 3 - EE.VST.128.IP q3, a2, 16 - addi a9, a9, -1 - bgez a9, tie728_s8_depthwise_conv2d_per_layer_hwc1_bias_prelu_loop - retw - - - tie728_s8_depthwise_conv2d_per_layer_hwc1_no_bias_prelu: - l32i a14, a4, 84 # activation_shift - tie728_s8_depthwise_conv2d_per_layer_hwc1_no_bias_prelu_loop: - EE.MOVI.32.A q7, a15, 1 - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a15 - tie728_s8_vector_round_result q3, a10, a11, q6 - EE.MOVI.32.A q7, a15, 3 - tie728_s8_conv2d_prelu q3, q5, a15, a14 - EE.MOVI.32.Q q7, a15, 3 - EE.VST.128.IP q3, a2, 16 - addi a9, a9, -1 - bgez a9, tie728_s8_depthwise_conv2d_per_layer_hwc1_no_bias_prelu_loop - retw - - -dl_tie728_s8_depthwise_conv2d_per_channel_hwc1_prelu: - l32i a10, a4, 104 # filter_channel_factor address - - beqz a14, tie728_s8_depthwise_conv2d_per_channel_hwc1_no_bias_prelu - - tie728_s8_depthwise_conv2d_per_channel_hwc1_bias_prelu_loop: - EE.MOVI.32.A q7, a15, 1 - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a15 - # tie728_s8_conv2d_per_channel_result q3, q4, a10, a11, q5 - tie728_s8_conv2d_per_channel_with_bias_result q3, q4, a10, a14, a11, q5 - l32i a11, a4, 84 # activation_shift - EE.MOVI.32.A q7, a15, 3 - # tie728_s8_conv2d_bias_prelu q3, q4, a14, q5, a15, a11 - tie728_s8_conv2d_prelu q3, q5, a15, a11 - EE.MOVI.32.Q q7, a15, 3 - EE.VST.128.IP q3, a2, 16 - addi a9, a9, -1 - bgez a9, tie728_s8_depthwise_conv2d_per_channel_hwc1_bias_prelu_loop - retw - - tie728_s8_depthwise_conv2d_per_channel_hwc1_no_bias_prelu: - l32i a14, a4, 84 # activation_shift - tie728_s8_depthwise_conv2d_per_channel_hwc1_no_bias_prelu_loop: - EE.MOVI.32.A q7, a15, 1 - EE.ZERO.QACC - tie728_s8_depthwise_conv2d_hws1 q0, q1, q2, q3, q4, q5, a3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a15 - tie728_s8_conv2d_per_channel_result q3, q4, a10, a11, q5 - EE.MOVI.32.A q7, a15, 3 - tie728_s8_conv2d_prelu q3, q5, a15, a14 - EE.MOVI.32.Q q7, a15, 3 - EE.VST.128.IP q3, a2, 16 - addi a9, a9, -1 - bgez a9, tie728_s8_depthwise_conv2d_per_channel_hwc1_no_bias_prelu_loop - retw - - - - - - - -############################################################################################################################################################ -#### -#### tie728_s8_unaligned_depthwise_conv2d_hwc1 series -#### -############################################################################################################################################################ - -.macro tie728_s8_depthwise_conv2d_unaligned_c_slice_updatey_padding input_v0 input_front_aligned input_back_aligned input_ptr filter_v0 filter_front_aligned filter_back_aligned filter_ptr dilation_y_offset remainder_c filter_y_offset - EE.LD.128.USAR.XP \input_front_aligned, \input_ptr, \remainder_c - - EE.VLD.128.XP \input_back_aligned, \input_ptr, \dilation_y_offset - EE.SRC.Q \input_v0, \input_front_aligned, \input_back_aligned - - EE.LD.128.USAR.XP \filter_front_aligned, \filter_ptr, \remainder_c - - EE.VLD.128.XP \filter_back_aligned, \filter_ptr, \filter_y_offset - EE.SRC.Q \filter_v0, \filter_front_aligned, \filter_back_aligned - - EE.VMULAS.S8.QACC \input_v0, \filter_v0 -.endm - - - -.macro tie728_s8_unaligned_depthwise_conv2d_1ws1 input_v0 input_v1 input_back_aligned input_ptr filter_v0 filter_ptr dilation_x_offset dilation_y_offset filter_w filter_w_rs1_1 filter_y_offset - - loopgtz \filter_w_rs1_1, 1f - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v1, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v0 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back_aligned - 1: - bbci \filter_w, 0, 2f - - # three 8-input-element left - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v1, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_y_offset - EE.VMULAS.S8.QACC.LD.XP \filter_v0, \filter_ptr, \filter_y_offset, \input_v1, \filter_v0 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q \input_v0, \input_v1, \input_back_aligned - EE.LD.128.USAR.IP \input_v1, \input_ptr, 16 - - j 3f -2: # two 8-input-element left - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_y_offset - EE.VMULAS.S8.QACC.LD.XP \filter_v0, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v1, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v0 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back_aligned -3: -.endm - - - -.macro tie728_s8_unaligned_depthwise_conv2d_1ws1_last input_v0 input_v1 input_back_aligned input_ptr filter_v0 filter_ptr dilation_x_offset filter_w filter_w_rs1_1 next_hws1 filter_y_offset - loopgtz \filter_w_rs1_1, 4f - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v1, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v1, \filter_v0 - EE.SRC.Q.LD.IP \input_v1, \input_ptr, 16, \input_v0, \input_back_aligned - 4: - bbci \filter_w, 0, 5f - - # three 16byte left - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VMULAS.S8.QACC.LD.IP \filter_v0, \filter_ptr, 16, \input_v0, \filter_v0 - EE.SRC.Q.LD.IP \input_v0, \input_ptr, 16, \input_v1, \input_back_aligned - - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \next_hws1 - EE.VMULAS.S8.QACC.LD.XP \filter_v0, \filter_ptr, \filter_y_offset, \input_v1, \filter_v0 - EE.SRC.Q \input_v0, \input_v0, \input_back_aligned - - EE.VMULAS.S8.QACC \input_v0, \filter_v0 - - j 6f -5: # two 16byte left - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \next_hws1 - EE.VMULAS.S8.QACC.LD.XP \filter_v0, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - EE.SRC.Q \input_v1, \input_v1, \input_back_aligned - - EE.VMULAS.S8.QACC \input_v1, \filter_v0 -6: - -.endm - - - - -.macro tie728_s8_unaligned_depthwise_conv2d_hws1 input_v0 input_v1 input_back_aligned input_ptr filter_v0 filter_ptr dilation_x_offset dilation_y_offset next_hws1 filter_h filter_w filter_w_rs1_1 args filter_offset_q filter_y_offset - # dilation_x_offset = input_channel_with_padding * dilation_x * sizeof(T) - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) - # next_hws1 = (-(filter_width - 1) * dilation_x * input_channel_with_padding - (filter_height - 1) * dilation_y * input_width_with_padding * input_channel_with_padding) * sizeof(T) + 16 - # filter_w_rs1_1 - - - # EE.ZERO.QACC - - l32i \filter_h, \args, 52 # filter_height - - - blti \filter_w, 2, 9f - - EE.LD.128.USAR.IP \input_v1, \input_ptr, 16 - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_x_offset - EE.VLD.128.IP \filter_v0, \filter_ptr, 16 # filter_v0 - EE.SRC.Q \input_v0, \input_v1, \input_back_aligned # input_v0 - - EE.LD.128.USAR.IP \input_v1, \input_ptr, 16 - - blti \filter_h, 2, 8f - 7: - tie728_s8_unaligned_depthwise_conv2d_1ws1 \input_v0, \input_v1, \input_back_aligned, \input_ptr, \filter_v0, \filter_ptr, \dilation_x_offset, \dilation_y_offset, \filter_w, \filter_w_rs1_1 \filter_y_offset - addi \filter_h, \filter_h, -1 - bgei \filter_h, 2, 7b - 8: # last y - tie728_s8_unaligned_depthwise_conv2d_1ws1_last \input_v0, \input_v1, \input_back_aligned, \input_ptr, \filter_v0, \filter_ptr, \dilation_x_offset, \filter_w, \filter_w_rs1_1, \next_hws1 \filter_y_offset - j 12f - - - # filter_w == 1 - 9: - EE.LD.128.USAR.IP \input_v1, \input_ptr, 16 - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_y_offset - EE.VLD.128.XP \filter_v0, \filter_ptr, \filter_y_offset # filter_v0 - EE.SRC.Q \input_v0, \input_v1, \input_back_aligned # input_v0 - - - blti \filter_h, 2, 11f - addi \filter_h, \filter_h, -1 - loopgtz \filter_h, 10f - EE.LD.128.USAR.IP \input_v1, \input_ptr, 16 - EE.LD.128.USAR.XP \input_back_aligned, \input_ptr, \dilation_y_offset - EE.VMULAS.S8.QACC.LD.XP \filter_v0, \filter_ptr, \filter_y_offset, \input_v0, \filter_v0 - EE.SRC.Q \input_v0, \input_v1, \input_back_aligned - 10: - - - 11: # last y - EE.VMULAS.S8.QACC \input_v0, \filter_v0 - sub \input_ptr, \input_ptr, \dilation_y_offset - add \input_ptr, \input_ptr, \next_hws1 - -12: - EE.MOVI.32.A \filter_offset_q, \filter_h, 2 - add \filter_ptr, \filter_ptr, \filter_h -.endm - - - - -.macro tie728_s8_unaligned_depthwise_conv2d_hws1_c_remainder input_v0 input_front_aligned input_back_aligned input_ptr filter_v0 filter_front_aligned filter_back_aligned filter_ptr dilation_x_offset dilation_y_offset filter_h filter_w filter_w_rs1_1 remainder_c args filter_y_offset - # dilation_x_offset = input_channel_with_padding * dilation_x * remainder_c - # dilation_y_offset = (-(filter_width - 1) * dilation_x * input_channel_with_padding + dilation_y * input_width_with_padding * input_channel_with_padding) * remainder_c - - # EE.ZERO.QACC - - l32i \filter_h, \args, 52 # filter_height - - blti \filter_w, 2, 5f - - 4: - loopgtz \filter_w_rs1_1, 1f - tie728_s8_depthwise_conv2d_unaligned_c_slice_updatex \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \dilation_x_offset, \remainder_c - tie728_s8_depthwise_conv2d_unaligned_c_slice_updatex \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \dilation_x_offset, \remainder_c - 1: - bbci \filter_w, 0, 2f - # 3 left - tie728_s8_depthwise_conv2d_unaligned_c_slice_updatex \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \dilation_x_offset, \remainder_c - tie728_s8_depthwise_conv2d_unaligned_c_slice_updatex \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \dilation_x_offset, \remainder_c - tie728_s8_depthwise_conv2d_unaligned_c_slice_updatey_padding \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \dilation_y_offset, \remainder_c, \filter_y_offset - j 3f - 2: - # 2 left - tie728_s8_depthwise_conv2d_unaligned_c_slice_updatex \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \dilation_x_offset, \remainder_c - tie728_s8_depthwise_conv2d_unaligned_c_slice_updatey_padding \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \dilation_y_offset, \remainder_c, \filter_y_offset - 3: - addi \filter_h, \filter_h, -1 - bgei \filter_h, 1, 4b - - j 7f - - - 5: - # filter_w == 1 - loopgtz \filter_h, 6f - tie728_s8_depthwise_conv2d_unaligned_c_slice_updatey_padding \input_v0, \input_front_aligned, \input_back_aligned, \input_ptr, \filter_v0, \filter_front_aligned, \filter_back_aligned, \filter_ptr, \dilation_y_offset, \remainder_c, \filter_y_offset - 6: - - -7: - -.endm - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_depthwise_conv2d_hwc1 - .type dl_tie728_s8_unaligned_depthwise_conv2d_hwc1, @function - # .section .iram1 -dl_tie728_s8_unaligned_depthwise_conv2d_hwc1: - .align 4 - entry sp, 128 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - - # a5: int8_t *filter_ptr - # a6: input dilation x offset - # a7: input dilation y offset - # a8: next_hws1 - # a9: c_div_x_1 - # a10: mac_shift - # a11: filter_h - # a12: filter_w - # a13: filter_w_rs1_1 - # a14: bias_ptr - # a15: activation_alpha_ptr - - l32i a12, a4, 60 - l32i a11, a4, 144 - EE.MOVI.32.Q q7, a12, 1 - EE.MOVI.32.Q q7, a11, 2 - - - l32i a10, a4, 64 # mac shift - l32i a14, a4, 68 # bias - # l32i a12, a4, 76 # activation_alpha - l32i a11, a4, 84 # activation_shift - l32i a15, a4, 80 # activation_alpha_ptr - - tie728_s8_unaligned_conv2d_operation_type a12, a10, a14, a11, a15, a4 - EE.MOVI.32.Q q7, a12, 0 #operation type - EE.MOVI.32.Q q7, a15, 3 - - tie728_s8_depthwise_conv2d_hwc1_load_args a4, a5, a6, a7, a8, a9, a12, a13 - - addi a6, a6, -16 - addi a7, a7, -16 - addi a8, a8, -16 - - - - # EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - # rur.sar_byte a11 - - # beqi a9, 0, tie728_s8_unaligned_depthwise_conv2d_hwc1_loop_last - - l32i a11, a4, 4 #input_channel - blti a11, 16, tie728_s8_unaligned_depthwise_conv2d_hwc1_c_loop_end #input_channel < 16 - - l32i a11, a4, 84 # activation_shift - blti a11, 0, tie728_s8_unaligned_depthwise_conv2d_hwc1_no_activation - -tie728_s8_unaligned_depthwise_conv2d_hwc1_with_activation: - tie728_s8_unaligned_depthwise_conv2d_hwc1_loop: - l32i a13, a4, 148 # filter_w_rs1_1 - l32i a12, a4, 56 # filter_w - EE.MOVI.32.A q7, a15, 1 - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a14, tie728_s8_unaligned_depthwise_conv2d_hwc1_loop_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a14 - tie728_s8_unaligned_depthwise_conv2d_hwc1_loop_no_preload_bias: - tie728_s8_unaligned_depthwise_conv2d_hws1 q0, q1, q2, a3, q3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a15 - - EE.MOVI.32.A q7, a13, 0 #operation type - l32i a12, a4, 84 # activation_shift - EE.MOVI.32.A q7, a15, 3 - tie728_s8_conv2d_1_1_unaligned_c_result a13, q4, a10, a14, a15, a12, a11, q5, q6 - EE.MOVI.32.Q q7, a15, 3 - #store to unaligned address - dl_tie728_s8_unaligned_store0 q4, a2, a11 - - addi a9, a9, -1 - bgez a9, tie728_s8_unaligned_depthwise_conv2d_hwc1_loop - - j tie728_s8_unaligned_depthwise_conv2d_hwc1_c_loop_end - - -tie728_s8_unaligned_depthwise_conv2d_hwc1_no_activation: - # EE.MOVI.32.A q7, a15, 0 #operation type - - tie728_s8_unaligned_depthwise_conv2d_hwc1_loop_no_act: - EE.MOVI.32.A q7, a15, 1 - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a14, tie728_s8_unaligned_depthwise_conv2d_hwc1_loop_no_act_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a14 - tie728_s8_unaligned_depthwise_conv2d_hwc1_loop_no_act_no_preload_bias: - tie728_s8_unaligned_depthwise_conv2d_hws1 q0, q1, q2, a3, q3, a5, a6, a7, a8, a11, a12, a13, a4, q7, a15 - EE.MOVI.32.A q7, a15, 0 - tie728_s8_conv2d_1_1_unaligned_c_result a15, q4, a10, a14, a11, a11, a11, q5, q6 - #store to unaligned address - dl_tie728_s8_unaligned_store0 q4, a2, a11 - - addi a9, a9, -1 - bgez a9, tie728_s8_unaligned_depthwise_conv2d_hwc1_loop_no_act - - j tie728_s8_unaligned_depthwise_conv2d_hwc1_c_loop_end - - - -tie728_s8_unaligned_depthwise_conv2d_hwc1_c_loop_end: - l32i a8, a4, 136 # c_remainder - - beqz a8, dl_tie728_s8_unaligned_depthwise_conv2d_hwc1_end - - l32i a12, a4, 160 - l32i a5, a4, 168 # filter_ptr unaligned - EE.MOVI.32.Q q7, a12, 1 - - addi a6, a6, 16 - addi a7, a7, 16 - sub a6, a6, a8 - sub a7, a7, a8 - - EE.MOVI.32.A q7, a9, 0 # operation type - - - l32i a13, a4, 148 # filter_w_rs1_1 - l32i a12, a4, 56 # filter_w - EE.MOVI.32.A q7, a15, 1 - EE.ZERO.QACC - - # Without modifications specifically for per-channel, there may be issues with per-channel - beqz a14, tie728_s8_unaligned_depthwise_conv2d_hwc1_c_loop_end_no_preload_bias - tie728_s8_conv2d_128b_vector_bias a14 - tie728_s8_unaligned_depthwise_conv2d_hwc1_c_loop_end_no_preload_bias: - tie728_s8_unaligned_depthwise_conv2d_hws1_c_remainder q0, q1, q2, a3, q3, q4, q5, a5, a6, a7, a11, a12, a13, a8, a4, a15 - - l32i a12, a4, 84 # activation_shift - EE.MOVI.32.A q7, a15, 3 - tie728_s8_conv2d_1_1_unaligned_c_result a9, q3, a10, a14, a15, a12, a11, q4, q5 - - # store low remainder_c part - dl_tie728_s8_store_remainder q3, a10, a11, a12, a13, a2, a8 - -dl_tie728_s8_unaligned_depthwise_conv2d_hwc1_end: - # addi a2, a2, -16 - - retw - diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_max2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_max2d.S deleted file mode 100644 index eaa41669..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_max2d.S +++ /dev/null @@ -1,578 +0,0 @@ -#include "dl_tie728_s8.S" - - -############################################################################################################################################################ -#### -#### tie728_s8_max2d_11c series -#### -############################################################################################################################################################ - - - .align 4 - .text - .global dl_tie728_s8_max2d_11c - .type dl_tie728_s8_max2d_11c, @function - .section .iram1 -dl_tie728_s8_max2d_11c: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - - - l32i a6, a5, 64 - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.VMAX.S8.LD.INCP q0, a3, q2, q0, q1 - EE.VLD.128.IP q1, a4, 16 - EE.VST.128.IP q2, a2, 16 - 0: - - EE.VMAX.S8 q2, q0, q1 - EE.VST.128.IP q2, a2, 16 - - retw - - - - - - .align 4 - .text - .global dl_tie728_s8_max2d_11c_relu - .type dl_tie728_s8_max2d_11c_relu, @function - .section .iram1 -dl_tie728_s8_max2d_11c_relu: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a14: activation_alpha - # a15: activation_shift - - - l32i a6, a5, 64 - l32i a14, a5, 52 - l32i a15, a5, 60 - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.VMAX.S8.LD.INCP q0, a3, q2, q0, q1 - EE.VLD.128.IP q1, a4, 16 - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - 0: - - EE.VMAX.S8 q2, q0, q1 - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - - retw - - - - - .align 4 - .text - .global dl_tie728_s8_max2d_11c_prelu - .type dl_tie728_s8_max2d_11c_prelu, @function - .section .iram1 -dl_tie728_s8_max2d_11c_prelu: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a14: activation_alpha_ptr - # a15: activation_shift - - - l32i a6, a5, 64 - l32i a7, a5, 76 - l32i a14, a5, 56 - l32i a15, a5, 60 - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.VMAX.S8.LD.INCP q0, a3, q2, q0, q1 - EE.VLD.128.IP q3, a14, 16 - EE.VLD.128.IP q1, a4, 16 - EE.VPRELU.S8 q2, q2, q3, a15 - EE.VST.128.IP q2, a2, 16 - 0: - - EE.VLD.128.IP q3, a14, 16 - EE.VMAX.S8 q2, q0, q1 - EE.VPRELU.S8 q2, q2, q3, a15 - EE.VST.128.IP q2, a2, 16 - - retw - - - - - -############################################################################################################################################################ -#### -#### tie728_s8_unaligned_max2d_11c series -#### -############################################################################################################################################################ - .align 4 - .text - .global dl_tie728_s8_unaligned_max2d_11c - .type dl_tie728_s8_unaligned_max2d_11c, @function - .section .iram1 -dl_tie728_s8_unaligned_max2d_11c: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: c_remainder - - - l32i a6, a5, 64 - l32i a7, a5, 76 - - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie718_s8_unaligned_max2d_11c_small_remainder # channel < 16 - - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie718_s8_unaligned_max2d_11c_0 - beqi a13, 8, dl_tie718_s8_unaligned_max2d_11c_1 - - - loopgtz a6, 0f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie718_s8_unaligned_max2d_11c_remainder - -dl_tie718_s8_unaligned_max2d_11c_0: - - loopgtz a6, 1f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VST.128.IP q2, a2, 16 - 1: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - EE.VST.128.IP q2, a2, 16 - j dl_tie718_s8_unaligned_max2d_11c_remainder - -dl_tie718_s8_unaligned_max2d_11c_1: - - loopgtz a6, 2f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_s8_unaligned_store1 q2, a2 - 2: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - dl_tie728_s8_unaligned_store1 q2, a2 - - j dl_tie718_s8_unaligned_max2d_11c_remainder - -dl_tie718_s8_unaligned_max2d_11c_small_remainder: - EE.LD.128.USAR.XP q0, a3, a7 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a7 - rur.sar_byte a12 - -dl_tie718_s8_unaligned_max2d_11c_remainder: - - - beqz a7, dl_tie728_s8_unaligned_max2d_11c_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - # dl_tie728_s8_unaligned_store0 q2, a2, a13 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a7 - -dl_tie728_s8_unaligned_max2d_11c_end: - retw - - - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_max2d_11c_relu - .type dl_tie728_s8_unaligned_max2d_11c_relu, @function - .section .iram1 -dl_tie728_s8_unaligned_max2d_11c_relu: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: c_remainder - # a14: activation_alpha - # a15: activation_shift - - - l32i a6, a5, 64 - l32i a7, a5, 76 - l32i a14, a5, 52 - l32i a15, a5, 60 - - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie718_s8_unaligned_max2d_11c_relu_small_remainder # channel < 16 - - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie718_s8_unaligned_max2d_11c_relu_0 - beqi a13, 8, dl_tie718_s8_unaligned_max2d_11c_relu_1 - - - loopgtz a6, 0f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie718_s8_unaligned_max2d_11c_relu_remainder - -dl_tie718_s8_unaligned_max2d_11c_relu_0: - - loopgtz a6, 1f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - 1: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie718_s8_unaligned_max2d_11c_relu_remainder - -dl_tie718_s8_unaligned_max2d_11c_relu_1: - - loopgtz a6, 2f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - 2: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - - j dl_tie718_s8_unaligned_max2d_11c_relu_remainder - -dl_tie718_s8_unaligned_max2d_11c_relu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a7 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a7 - rur.sar_byte a12 - -dl_tie718_s8_unaligned_max2d_11c_relu_remainder: - - - beqz a7, dl_tie728_s8_unaligned_max2d_11c_relu_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - EE.VRELU.S8 q2, a14, a15 - # dl_tie728_s8_unaligned_store0 q2, a2, a13 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a7 - -dl_tie728_s8_unaligned_max2d_11c_relu_end: - retw - - - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_max2d_11c_prelu - .type dl_tie728_s8_unaligned_max2d_11c_prelu, @function - .section .iram1 -dl_tie728_s8_unaligned_max2d_11c_prelu: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: c_remainder - # a14: activation_alpha - # a15: activation_shift_ptr - - - l32i a6, a5, 64 - l32i a7, a5, 76 - l32i a14, a5, 56 - l32i a15, a5, 60 - - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie718_s8_unaligned_max2d_11c_prelu_small_remainder # channel < 16 - - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie718_s8_unaligned_max2d_11c_prelu_0 - beqi a13, 8, dl_tie718_s8_unaligned_max2d_11c_prelu_1 - - - loopgtz a6, 0f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VMAX.S8 q2, q2, q5 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie718_s8_unaligned_max2d_11c_prelu_remainder - -dl_tie718_s8_unaligned_max2d_11c_prelu_0: - - loopgtz a6, 1f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S8 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 - 1: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VMAX.S8 q2, q2, q5 - EE.VPRELU.S8 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie718_s8_unaligned_max2d_11c_prelu_remainder - -dl_tie718_s8_unaligned_max2d_11c_prelu_1: - - loopgtz a6, 2f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMAX.S8 q2, q2, q5 - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - 2: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VMAX.S8 q2, q2, q5 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - - j dl_tie718_s8_unaligned_max2d_11c_prelu_remainder - -dl_tie718_s8_unaligned_max2d_11c_prelu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a7 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a7 - rur.sar_byte a12 - -dl_tie718_s8_unaligned_max2d_11c_prelu_remainder: - - - beqz a7, dl_tie728_s8_unaligned_max2d_11c_prelu_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VMAX.S8 q2, q2, q5 - EE.VPRELU.S8 q2, q2, q6, a15 - # dl_tie728_s8_unaligned_store0 q2, a2, a13 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a7 - -dl_tie728_s8_unaligned_max2d_11c_prelu_end: - retw \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_max_pool2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_max_pool2d.S deleted file mode 100644 index 73aecba8..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_max_pool2d.S +++ /dev/null @@ -1,497 +0,0 @@ -############################################################################################################################################################ -#### -#### dl_tie728_s8_max_pool2d series -#### -############################################################################################################################################################ - -#include "dl_tie728_s8.S" - - .align 4 - .text - .global dl_tie728_s8_max_pool2d_22c1 - .type dl_tie728_s8_max_pool2d_22c1, @function - .section .iram1 -dl_tie728_s8_max_pool2d_22c1: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - l32i a5, a4, 16 # input_y_offset - l32i a6, a4, 20 # input_x_offset - l32i a11, a4, 104 # c_div_x_1 - - add a7, a3, a6 - add a8, a3, a5 - add a9, a8, a6 - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a7, 16 - EE.VLD.128.IP q2, a8, 16 - loopgtz a11, 0f - EE.VMAX.S8.LD.INCP q3, a9, q7, q0, q1 - EE.VMAX.S8.LD.INCP q0, a3, q7, q7, q2 - EE.VMAX.S8.LD.INCP q1, a7, q7, q7, q3 - EE.VLD.128.IP q2, a8, 16 - EE.VST.128.IP q7, a2, 16 - 0: - - EE.VMAX.S8.LD.INCP q3, a9, q7, q0, q1 - EE.VMAX.S8 q7, q7, q2 - EE.VMAX.S8 q7, q7, q3 - EE.VST.128.IP q7, a2, 16 - - retw - - - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_max_pool2d_22c1 - .type dl_tie728_s8_unaligned_max_pool2d_22c1, @function - .section .iram1 -dl_tie728_s8_unaligned_max_pool2d_22c1: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - l32i a5, a4, 16 # input_y_offset - l32i a6, a4, 20 # input_x_offset - l32i a10, a4, 4 # input_channel - l32i a11, a4, 104 # c_div_x_1 - l32i a12, a4, 60 # c_remainder - - add a7, a3, a6 - add a8, a3, a5 - add a9, a8, a6 - - blti a11, 0, dl_tie728_s8_unaligned_max_pool2d_22c1_remainder #channel < 16 - - EE.LD.128.USAR.IP q6, a2, 0 #get output_ptr sar_byte - rur.sar_byte a15 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q1, a3, 0 - - beqi a15, 0, 1f - beqi a15, 8, 2f - - loopgtz a11, 0f - EE.SRC.Q.LD.IP q2, a7, 16, q0, q1 - - EE.LD.128.USAR.IP q3, a7, 0 - EE.SRC.Q.LD.IP q4, a8, 16, q2, q3 - - EE.LD.128.USAR.IP q5, a8, 0 - EE.VMAX.S8 q7, q0, q2 - EE.SRC.Q.LD.IP q2, a9, 16, q4, q5 - - EE.LD.128.USAR.IP q3, a9, 0 - EE.VMAX.S8 q7, q7, q4 - EE.SRC.Q.LD.IP q0, a3, 16, q2, q3 - - EE.LD.128.USAR.IP q1, a3, 0 - EE.VMAX.S8 q7, q7, q2 - dl_tie728_s8_unaligned_store0 q7, a2, a14 - 0: - j dl_tie728_s8_unaligned_max_pool2d_22c1_loop_end - - -1: - loopgtz a11, 0f - EE.SRC.Q.LD.IP q2, a7, 16, q0, q1 - - EE.LD.128.USAR.IP q3, a7, 0 - EE.SRC.Q.LD.IP q4, a8, 16, q2, q3 - - EE.LD.128.USAR.IP q5, a8, 0 - EE.VMAX.S8 q7, q0, q2 - EE.SRC.Q.LD.IP q2, a9, 16, q4, q5 - - EE.LD.128.USAR.IP q3, a9, 0 - EE.VMAX.S8 q7, q7, q4 - EE.SRC.Q.LD.IP q0, a3, 16, q2, q3 - - EE.LD.128.USAR.IP q1, a3, 0 - EE.VMAX.S8 q7, q7, q2 - EE.VST.128.IP q7, a2, 16 - 0: - j dl_tie728_s8_unaligned_max_pool2d_22c1_loop_end - -2: - loopgtz a11, 0f - EE.SRC.Q.LD.IP q2, a7, 16, q0, q1 - - EE.LD.128.USAR.IP q3, a7, 0 - EE.SRC.Q.LD.IP q4, a8, 16, q2, q3 - - EE.LD.128.USAR.IP q5, a8, 0 - EE.VMAX.S8 q7, q0, q2 - EE.SRC.Q.LD.IP q2, a9, 16, q4, q5 - - EE.LD.128.USAR.IP q3, a9, 0 - EE.VMAX.S8 q7, q7, q4 - EE.SRC.Q.LD.IP q0, a3, 16, q2, q3 - - EE.LD.128.USAR.IP q1, a3, 0 - EE.VMAX.S8 q7, q7, q2 - dl_tie728_s8_unaligned_store1 q7, a2 - 0: - - -dl_tie728_s8_unaligned_max_pool2d_22c1_loop_end: - EE.SRC.Q.LD.IP q2, a7, 16, q0, q1 - - EE.LD.128.USAR.IP q3, a7, 0 - EE.SRC.Q.LD.IP q4, a8, 16, q2, q3 - - EE.LD.128.USAR.IP q5, a8, 0 - EE.VMAX.S8 q7, q0, q2 - EE.SRC.Q.LD.IP q2, a9, 16, q4, q5 - - EE.LD.128.USAR.IP q3, a9, 0 - EE.VMAX.S8 q7, q7, q4 - EE.SRC.Q q2, q2, q3 - EE.VMAX.S8 q7, q7, q2 - - dl_tie728_s8_unaligned_store0 q7, a2, a14 - - beqz a12, dl_tie728_s8_unaligned_max_pool2d_22c1_end - -dl_tie728_s8_unaligned_max_pool2d_22c1_remainder: - EE.LD.128.USAR.XP q0, a3, a12 - EE.VLD.128.IP q1, a3, 0 - EE.SRC.Q q0, q0, q1 - - EE.LD.128.USAR.XP q2, a7, a12 - EE.VLD.128.IP q3, a7, 0 - EE.SRC.Q q2, q2, q3 - - EE.LD.128.USAR.XP q4, a8, a12 - EE.VLD.128.IP q5, a8, 0 - EE.VMAX.S8 q7, q0, q2 - EE.SRC.Q q4, q4, q5 - - EE.LD.128.USAR.XP q2, a9, a12 - EE.VLD.128.IP q3, a9, 0 - EE.VMAX.S8 q7, q7, q4 - EE.SRC.Q q2, q2, q3 - - EE.VMAX.S8 q7, q7, q2 - - dl_tie728_s8_store_remainder q7, a8, a9, a10, a11, a2, a12 - -dl_tie728_s8_unaligned_max_pool2d_22c1_end: - - retw - - - - - -.macro dl_tie728_s8_max_pool2d_hw - 4: - - EE.VLD.128.XP q0, a13, a6 - loopgtz a10, 0f - EE.VLD.128.XP q1, a13, a6 - EE.VMAX.S8 q7, q7, q0 - - EE.VLD.128.XP q0, a13, a6 - EE.VMAX.S8 q7, q7, q1 - 0: - - bbci a9, 0, 2f - 1:#three left - EE.VLD.128.XP q1, a13, a6 - EE.VMAX.S8 q7, q7, q0 - EE.VLD.128.XP q0, a13, a6 - EE.VMAX.S8 q7, q7, q1 - EE.VMAX.S8 q7, q7, q0 - j 3f - - 2: # two left - EE.VLD.128.XP q1, a13, a6 - EE.VMAX.S8 q7, q7, q0 - EE.VMAX.S8 q7, q7, q1 - - 3: - addi a14, a14, -1 - add a7, a7, a5 - mov a13, a7 - bnez a14, 4b -.endm - - - .align 4 - .text - .global dl_tie728_s8_max_pool2d_hwc1 - .type dl_tie728_s8_max_pool2d, @function - .section .iram1 -dl_tie728_s8_max_pool2d_hwc1: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - l32i a5, a4, 16 # input_y_offset - l32i a6, a4, 20 # input_x_offset - l32i a7, a4, 4 # input_channel - l32i a8, a4, 48 # filter_height - l32i a9, a4, 52 # filter_width - l32i a11, a4, 104 # c_div_x_1 - - srli a10, a9, 1 - addi a10, a10, -1 # filter_w / 2 - 1 - - beqi a9, 1, dl_tie728_s8_max_pool2d_h1c1 #filter_width == 1 - blti a11, 1, dl_tie728_s8_max_pool2d_hw_small_channel - 5: - mov a7, a3 - mov a13, a7 - EE.VLD.128.IP q7, a13, 0 - mov a14, a8 - - dl_tie728_s8_max_pool2d_hw - - EE.VST.128.IP q7, a2, 16 - addi a3, a3, 16 - addi a11, a11, -1 - bnez a11, 5b - -dl_tie728_s8_max_pool2d_hw_small_channel: - mov a7, a3 - mov a13, a7 - EE.VLD.128.IP q7, a13, 0 - mov a14, a8 - - dl_tie728_s8_max_pool2d_hw - EE.VST.128.IP q7, a2, 16 - - retw - - -dl_tie728_s8_max_pool2d_h1c1: - addi a8, a8, -1 - blti a11, 1, dl_tie728_s8_max_pool2d_h1_small_channel - 1: - mov a13, a3 - EE.VLD.128.XP q7, a13, a5 - loopgtz a8, 0f - EE.VLD.128.XP q0, a13, a5 - EE.VMAX.S8 q7, q7, q0 - 0: - - EE.VST.128.IP q7, a2, 16 - addi a3, a3, 16 - addi a11, a11, -1 - bnez a11, 1b - -dl_tie728_s8_max_pool2d_h1_small_channel: - mov a13, a3 - EE.VLD.128.XP q7, a13, a5 - loopgtz a8, 0f - EE.VLD.128.XP q0, a13, a5 - EE.VMAX.S8 q7, q7, q0 - 0: - - EE.VST.128.IP q7, a2, 16 - - retw - - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_max_pool2d_hwc1 - .type dl_tie728_s8_unaligned_max_pool2d_hwc1, @function - .section .iram1 -dl_tie728_s8_unaligned_max_pool2d_hwc1: - .align 4 - entry sp, 16 - # a2: int16_t *output_ptr - # a3: int16_t *input_ptr - # a4: void *args - - l32i a5, a4, 16 # input_y_offset - l32i a6, a4, 20 # input_x_offset - l32i a7, a4, 4 # input_channel - l32i a8, a4, 48 # filter_height - l32i a9, a4, 52 # filter_width - l32i a11, a4, 104 # c_div_x_1 - l32i a12, a4, 60 # c_remainder - - srli a10, a9, 1 - addi a10, a10, -1 # filter_w / 2 - 1 - - addi a6, a6, -16 - - EE.LD.128.USAR.IP q6, a2, 0 #get output_ptr sar_byte - rur.sar_byte a15 - - addi a11, a11, 1 - - beqi a9, 1, dl_tie728_s8_unaligned_max_pool2d_h1c1 #filter_width == 1 - blti a11, 1, dl_tie728_s8_unaligned_max_pool2d_hw_remainder - - 9: - mov a7, a3 - mov a13, a7 - EE.LD.128.USAR.IP q6, a13, 16 - EE.LD.128.USAR.IP q7, a13, -16 - EE.SRC.Q q7, q6, q7 - mov a14, a8 - - 4: - EE.LD.128.USAR.IP q0, a13, 16 - EE.LD.128.USAR.XP q1, a13, a6 - loopgtz a10, 0f - EE.SRC.Q.LD.IP q2, a13, 16, q0, q1 - EE.LD.128.USAR.XP q1, a13, a6 - EE.VMAX.S8 q7, q7, q0 - - EE.SRC.Q.LD.IP q0, a13, 16, q2, q1 - EE.LD.128.USAR.XP q1, a13, a6 - EE.VMAX.S8 q7, q7, q2 - 0: - - bbci a9, 0, 2f - 1:#three left - EE.SRC.Q.LD.IP q2, a13, 16, q0, q1 - EE.LD.128.USAR.XP q1, a13, a6 - EE.VMAX.S8 q7, q7, q0 - - EE.SRC.Q.LD.IP q0, a13, 16, q2, q1 - EE.LD.128.USAR.XP q1, a13, a6 - EE.VMAX.S8 q7, q7, q2 - - EE.SRC.Q q0, q0, q1 - EE.VMAX.S8 q7, q7, q0 - - j 3f - - 2:# two left - EE.SRC.Q.LD.IP q2, a13, 16, q0, q1 - EE.LD.128.USAR.XP q1, a13, a6 - EE.VMAX.S8 q7, q7, q0 - - EE.SRC.Q q2, q2, q1 - EE.VMAX.S8 q7, q7, q2 - - 3: - addi a14, a14, -1 - add a7, a7, a5 - mov a13, a7 - bnez a14, 4b - - beqi a15, 0, 5f - beqi a15, 8, 6f - - dl_tie728_s8_unaligned_store0 q7, a2, a14 - j 7f - - 5: - EE.VST.128.IP q7, a2, 16 - j 7f - 6: - dl_tie728_s8_unaligned_store1 q7, a2 - - 7: - addi a3, a3, 16 - addi a11, a11, -1 - bnez a11, 9b - -dl_tie728_s8_unaligned_max_pool2d_hw_remainder: - beqz a12, dl_tie728_s8_unaligned_max_pool2d_hw_remainder_end - - mov a7, a3 - mov a13, a7 - EE.LD.128.USAR.IP q6, a13, 16 - EE.LD.128.USAR.IP q7, a13, -16 - EE.SRC.Q q7, q6, q7 - mov a14, a8 - addi a6, a6, 16 - sub a6, a6, a12 - - 1: - loopgtz a9, 0f - EE.LD.128.USAR.XP q0, a13, a12 - EE.VLD.128.XP q1, a13, a6 - EE.SRC.Q q0, q0, q1 - EE.VMAX.S8 q7, q7, q0 - 0: - addi a14, a14, -1 - add a7, a7, a5 - mov a13, a7 - bnez a14, 1b - - dl_tie728_s8_store_remainder q7, a8, a9, a10, a11, a2, a12 -dl_tie728_s8_unaligned_max_pool2d_hw_remainder_end: - - retw - - -dl_tie728_s8_unaligned_max_pool2d_h1c1: - addi a5, a5, -16 - addi a8, a8, -1 - blti a11, 1, dl_tie728_s8_unaligned_max_pool2d_h1_remainder - - 5: - mov a13, a3 - EE.LD.128.USAR.IP q6, a13, 16 - EE.VLD.128.XP q7, a13, a5 - EE.SRC.Q q7, q6, q7 - - loopgtz a8, 0f - EE.LD.128.USAR.IP q0, a13, 16 - EE.LD.128.USAR.XP q1, a13, a5 - EE.SRC.Q q0, q0, q1 - EE.VMAX.S8 q7, q7, q0 - 0: - - beqi a15, 0, 1f - beqi a15, 8, 2f - - dl_tie728_s8_unaligned_store0 q7, a2, a14 - j 3f - 1: - EE.VST.128.IP q7, a2, 16 - j 3f - 2: - dl_tie728_s8_unaligned_store1 q7, a2 - - 3: - addi a3, a3, 16 - addi a11, a11, -1 - bnez a11, 5b - -dl_tie728_s8_unaligned_max_pool2d_h1_remainder: - - beqz a12, dl_tie728_s8_unaligned_max_pool2d_h1_remainder_end - - addi a5, a5, 16 - sub a5, a5, a12 - - mov a13, a3 - EE.LD.128.USAR.XP q6, a13, a12 - EE.VLD.128.XP q7, a13, a5 - EE.SRC.Q q7, q6, q7 - - loopgtz a8, 0f - EE.LD.128.USAR.XP q0, a13, a12 - EE.VLD.128.XP q1, a13, a5 - EE.SRC.Q q0, q0, q1 - EE.VMAX.S8 q7, q7, q0 - 0: - - dl_tie728_s8_store_remainder q7, a8, a9, a10, a11, a2, a12 - -dl_tie728_s8_unaligned_max_pool2d_h1_remainder_end: - retw diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_min2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_min2d.S deleted file mode 100644 index 801cf771..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_min2d.S +++ /dev/null @@ -1,187 +0,0 @@ -#include "dl_tie728_s8.S" - - -############################################################################################################################################################ -#### -#### tie728_s8_min2d_11c series -#### -############################################################################################################################################################ - - - .align 4 - .text - .global dl_tie728_s8_min2d_11c - .type dl_tie728_s8_min2d_11c, @function - .section .iram1 -dl_tie728_s8_min2d_11c: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - - l32i a6, a5, 64 - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.VMIN.S8.LD.INCP q0, a3, q2, q0, q1 - EE.VLD.128.IP q1, a4, 16 - EE.VST.128.IP q2, a2, 16 - 0: - - EE.VMIN.S8 q2, q0, q1 - EE.VST.128.IP q2, a2, 16 - - retw - - - - - - -############################################################################################################################################################ -#### -#### tie728_s8_unaligned_min2d_11c series -#### -############################################################################################################################################################ - .align 4 - .text - .global dl_tie728_s8_unaligned_min2d_11c - .type dl_tie728_s8_unaligned_min2d_11c, @function - .section .iram1 -dl_tie728_s8_unaligned_min2d_11c: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: c_remainder - - - l32i a6, a5, 64 - l32i a7, a5, 76 - - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie718_s8_unaligned_min2d_11c_small_remainder # channel < 16 - - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie718_s8_unaligned_min2d_11c_0 - beqi a13, 8, dl_tie718_s8_unaligned_min2d_11c_1 - - - loopgtz a6, 0f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMIN.S8 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMIN.S8 q2, q2, q5 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie718_s8_unaligned_min2d_11c_remainder - -dl_tie718_s8_unaligned_min2d_11c_0: - - loopgtz a6, 1f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMIN.S8 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VST.128.IP q2, a2, 16 - 1: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMIN.S8 q2, q2, q5 - EE.VST.128.IP q2, a2, 16 - j dl_tie718_s8_unaligned_min2d_11c_remainder - -dl_tie718_s8_unaligned_min2d_11c_1: - - loopgtz a6, 2f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMIN.S8 q2, q2, q5 - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_s8_unaligned_store1 q2, a2 - 2: - addi a3, a3, -16 - add a3, a3, a7 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMIN.S8 q2, q2, q5 - dl_tie728_s8_unaligned_store1 q2, a2 - - j dl_tie718_s8_unaligned_min2d_11c_remainder - -dl_tie718_s8_unaligned_min2d_11c_small_remainder: - EE.LD.128.USAR.XP q0, a3, a7 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a7 - rur.sar_byte a12 - -dl_tie718_s8_unaligned_min2d_11c_remainder: - - - beqz a7, dl_tie728_s8_unaligned_min2d_11c_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VMIN.S8 q2, q2, q5 - # dl_tie728_s8_unaligned_store0 q2, a2, a13 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a7 - -dl_tie728_s8_unaligned_min2d_11c_end: - retw - - diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_mul2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_mul2d.S deleted file mode 100644 index be6e24e1..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_mul2d.S +++ /dev/null @@ -1,683 +0,0 @@ -#include "dl_tie728_s8.S" - - -############################################################################################################################################################ -#### -#### tie728_s8_mul2d_11c series -#### -############################################################################################################################################################ - - .align 4 - .text - .global dl_tie728_s8_mul2d_11c - .type dl_tie728_s8_mul2d_11c, @function - .section .iram1 -dl_tie728_s8_mul2d_11c: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: mul_shift - - l32i a6, a5, 64 - l32i a7, a5, 100 - l32i a8, a5, 76 - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.ZERO.QACC - EE.VMULAS.S8.QACC.LD.IP q0, a3, 16, q0, q1 - EE.VLD.128.IP q1, a4, 16 - # EE.SRCMB.S8.QACC q2, a7, 0 - tie728_s8_vector_round_result q2, a7, a10, q7 - EE.VST.128.IP q2, a2, 16 - 0: - - EE.ZERO.QACC - EE.VMULAS.S8.QACC q0, q1 - # EE.SRCMB.S8.QACC q2, a7, 0 - tie728_s8_vector_round_result q2, a7, a10, q7 - - EE.VST.128.IP q2, a2, 16 - - retw - - - - - .align 4 - .text - .global dl_tie728_s8_mul2d_11c_relu - .type dl_tie728_s8_mul2d_11c_relu, @function - .section .iram1 -dl_tie728_s8_mul2d_11c_relu: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: mul_shift - # a14: activation_alpha - # a15: activation_shift - - - l32i a6, a5, 64 - l32i a7, a5, 100 - l32i a8, a5, 76 - l32i a14, a5, 52 - l32i a15, a5, 60 - - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.ZERO.QACC - EE.VMULAS.S8.QACC.LD.IP q0, a3, 16, q0, q1 - EE.VLD.128.IP q1, a4, 16 - # EE.SRCMB.S8.QACC q2, a7, 0 - tie728_s8_vector_round_result q2, a7, a10, q7 - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - 0: - - EE.ZERO.QACC - EE.VMULAS.S8.QACC q0, q1 - # EE.SRCMB.S8.QACC q2, a7, 0 - tie728_s8_vector_round_result q2, a7, a10, q7 - - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - - retw - - - - - .align 4 - .text - .global dl_tie728_s8_mul2d_11c_prelu - .type dl_tie728_s8_mul2d_11c_prelu, @function - .section .iram1 -dl_tie728_s8_mul2d_11c_prelu: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: mul_shift - # a14: activation_alpha_ptr - # a15: activation_shift - - - l32i a6, a5, 64 - l32i a7, a5, 100 - l32i a14, a5, 56 - l32i a15, a5, 60 - - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.ZERO.QACC - EE.VMULAS.S8.QACC.LD.IP q0, a3, 16, q0, q1 - EE.VLD.128.IP q1, a4, 16 - - EE.VLD.128.IP q3, a14, 16 - # EE.SRCMB.S8.QACC q2, a7, 0 - tie728_s8_vector_round_result q2, a7, a10, q7 - EE.VPRELU.S8 q2, q2, q3, a15 - EE.VST.128.IP q2, a2, 16 - 0: - - EE.ZERO.QACC - EE.VMULAS.S8.QACC q0, q1 - EE.VLD.128.IP q3, a14, 16 - # EE.SRCMB.S8.QACC q2, a7, 0 - tie728_s8_vector_round_result q2, a7, a10, q7 - - EE.VPRELU.S8 q2, q2, q3, a15 - EE.VST.128.IP q2, a2, 16 - - retw - - - - - - -############################################################################################################################################################ -#### -#### tie728_s8_unaligned_mul2d_11c series -#### -############################################################################################################################################################ - - .align 4 - .text - .global dl_tie728_s8_unaligned_mul2d_11c - .type dl_tie728_s8_unaligned_mul2d_11c, @function - .section .iram1 -dl_tie728_s8_unaligned_mul2d_11c: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: c_remainder - # a8: mul_shift - - - l32i a6, a5, 64 - l32i a7, a5, 76 - l32i a8, a5, 100 - - - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie718_s8_unaligned_mul2d_11c_small_remainder # channel < 16 - - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie718_s8_unaligned_mul2d_11c_0 - beqi a13, 8, dl_tie718_s8_unaligned_mul2d_11c_1 - - - loopgtz a6, 0f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - 0: - - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie718_s8_unaligned_mul2d_11c_remainder - -dl_tie718_s8_unaligned_mul2d_11c_0: - - loopgtz a6, 1f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VST.128.IP q2, a2, 16 - 1: - - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.VST.128.IP q2, a2, 16 - j dl_tie718_s8_unaligned_mul2d_11c_remainder - -dl_tie718_s8_unaligned_mul2d_11c_1: - - loopgtz a6, 2f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_s8_unaligned_store1 q2, a2 - 2: - - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - dl_tie728_s8_unaligned_store1 q2, a2 - - j dl_tie718_s8_unaligned_mul2d_11c_remainder - -dl_tie718_s8_unaligned_mul2d_11c_small_remainder: - EE.LD.128.USAR.XP q0, a3, a7 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a7 - rur.sar_byte a12 - -dl_tie718_s8_unaligned_mul2d_11c_remainder: - - - beqz a7, dl_tie728_s8_unaligned_mul2d_11c_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.ZERO.QACC - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - - # dl_tie728_s8_unaligned_store0 q2, a2, a13 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a7 - -dl_tie728_s8_unaligned_mul2d_11c_end: - retw - - - - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_mul2d_11c_relu - .type dl_tie728_s8_unaligned_mul2d_11c_relu, @function - .section .iram1 -dl_tie728_s8_unaligned_mul2d_11c_relu: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: c_remainder - # a8: mul_shift - # a14: activation_alpha - # a15: activation_shift - - - l32i a6, a5, 64 - l32i a7, a5, 76 - l32i a8, a5, 100 - l32i a14, a5, 52 - l32i a15, a5, 60 - - - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie718_s8_unaligned_mul2d_11c_relu_small_remainder # channel < 16 - - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie718_s8_unaligned_mul2d_11c_relu_0 - beqi a13, 8, dl_tie718_s8_unaligned_mul2d_11c_relu_1 - - - loopgtz a6, 0f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie718_s8_unaligned_mul2d_11c_relu_remainder - -dl_tie718_s8_unaligned_mul2d_11c_relu_0: - - loopgtz a6, 1f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - 1: - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie718_s8_unaligned_mul2d_11c_relu_remainder - -dl_tie718_s8_unaligned_mul2d_11c_relu_1: - - loopgtz a6, 2f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - 2: - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - j dl_tie718_s8_unaligned_mul2d_11c_relu_remainder - -dl_tie718_s8_unaligned_mul2d_11c_relu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a7 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a7 - rur.sar_byte a12 - -dl_tie718_s8_unaligned_mul2d_11c_relu_remainder: - - - beqz a7, dl_tie728_s8_unaligned_mul2d_11c_relu_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.ZERO.QACC - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.VRELU.S8 q2, a14, a15 - # dl_tie728_s8_unaligned_store0 q2, a2, a13 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a7 - -dl_tie728_s8_unaligned_mul2d_11c_relu_end: - retw - - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_mul2d_11c_prelu - .type dl_tie728_s8_unaligned_mul2d_11c_prelu, @function - .section .iram1 -dl_tie728_s8_unaligned_mul2d_11c_prelu: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: c_remainder - # a8: mul_shift - # a14: activation_alpha_ptr - # a15: activation_shift - - - l32i a6, a5, 64 - l32i a7, a5, 76 - l32i a8, a5, 100 - l32i a14, a5, 56 - l32i a15, a5, 60 - - - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie718_s8_unaligned_mul2d_11c_prelu_small_remainder # channel < 16 - - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie718_s8_unaligned_mul2d_11c_prelu_0 - beqi a13, 8, dl_tie718_s8_unaligned_mul2d_11c_prelu_1 - - - loopgtz a6, 0f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - EE.VLD.128.IP q6, a14, 16 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie718_s8_unaligned_mul2d_11c_prelu_remainder - -dl_tie718_s8_unaligned_mul2d_11c_prelu_0: - - loopgtz a6, 1f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S8 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 - 1: - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - EE.VLD.128.IP q6, a14, 16 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.VPRELU.S8 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie718_s8_unaligned_mul2d_11c_prelu_remainder - -dl_tie718_s8_unaligned_mul2d_11c_prelu_1: - - loopgtz a6, 2f - EE.ZERO.QACC - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - 2: - addi a3, a3, -16 - add a3, a3, a7 - EE.ZERO.QACC - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a7 - rur.sar_byte a12 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.VMULAS.S8.QACC q2, q5 - EE.VLD.128.IP q6, a14, 16 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - j dl_tie718_s8_unaligned_mul2d_11c_prelu_remainder - - -dl_tie718_s8_unaligned_mul2d_11c_prelu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a7 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a7 - rur.sar_byte a12 - -dl_tie718_s8_unaligned_mul2d_11c_prelu_remainder: - - beqz a7, dl_tie728_s8_unaligned_mul2d_11c_prelu_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.ZERO.QACC - EE.VMULAS.S8.QACC q2, q5 - EE.VLD.128.IP q6, a14, 16 - # EE.SRCMB.S8.QACC q2, a8, 0 - tie728_s8_vector_round_result q2, a8, a10, q7 - EE.VPRELU.S8 q2, q2, q6, a15 - # dl_tie728_s8_unaligned_store0 q2, a2, a13 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a7 - -dl_tie728_s8_unaligned_mul2d_11c_prelu_end: - retw \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_prelu.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_prelu.S deleted file mode 100644 index 836571a7..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_prelu.S +++ /dev/null @@ -1,78 +0,0 @@ -#include "dl_tie728_s8.S" - - .align 4 - .text - .global dl_tie728_s8_prelu_11c - .type dl_tie728_s8_prelu_11c, @function - .section .iram1 -dl_tie728_s8_prelu_11c: - .align 4 - entry sp, 24 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - # a5: c_div_x_1 - # a12: activation_alpha_ptr - # a13: activation_shift - # a14: output_shift - # a15: output_scale - - - l32i a5, a4, 100 - l32i a12, a4, 80 # activation_alpha_ptr - l32i a13, a4, 84 # activation_shift - l32i a14, a4, 172 # output_shift - l32i a15, a4, 176 # output_scale - - ee.vld.128.ip q0, a3, 16 - ee.vld.128.ip q1, a12, 16 - movi a6, 0 - s8i a6, sp, 0 - ee.vldbc.8.ip q2, sp, 0 # all 0 - s8i a15, sp, 0 - ee.vldbc.8.ip q3, sp, 0 # all output_scale - - - loopgtz a5, 0f - ee.vcmp.gt.s8 q4, q0, q2 - ee.notq q5, q4 - - ee.zero.qacc - # alpha * input - ee.vmulas.s8.qacc q0, q1 - # right shift: output - alpha - input - tie728_s8_vector_round_result q1, a13, a6, q6 - - # *scale/right shift: output - input - ee.zero.qacc - ee.vmulas.s8.qacc q0, q3 - tie728_s8_vector_round_result q0, a14, a6, q6 - ee.andq q0, q0, q4 - ee.andq q1, q1, q5 - ee.vadds.s8.ld.incp q1, a12, q0, q0, q1 - - ee.vst.128.ip q0, a2, 16 - ee.vld.128.ip q0, a3, 16 - 0: - - ee.vcmp.gt.s8 q4, q0, q2 - ee.notq q5, q4 - - ee.zero.qacc - # alpha * input - ee.vmulas.s8.qacc q0, q1 - # right shift: output - alpha - input - tie728_s8_vector_round_result q1, a13, a6, q6 - - # *scale/right shift: output - input - ee.zero.qacc - ee.vmulas.s8.qacc q0, q3 - tie728_s8_vector_round_result q0, a14, a6, q6 - ee.andq q0, q0, q4 - ee.andq q1, q1, q5 - ee.vadds.s8 q0, q0, q1 - - ee.vst.128.ip q0, a2, 0 - - retw diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_relu.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_relu.S deleted file mode 100644 index 19ec837b..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_relu.S +++ /dev/null @@ -1,164 +0,0 @@ -#include "dl_tie728_s8.S" - - .align 4 - .text - .global dl_tie728_s8_relu_11c - .type dl_tie728_s8_relu_11c, @function - .section .iram1 -dl_tie728_s8_relu_11c: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - # a5: c_rs1_1: c / 2x - 1 - # a6: c_rs2_1: c_left_1 - # a14: activation_alpha - # a15: activation_shift - - - l32i a5, a4, 88 - l32i a6, a4, 92 - l32i a7, a4, 136 - l32i a14, a4, 76 # activation_alpha - l32i a15, a4, 84 # activation_shift - - - loopgtz a5, 0f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a3, 16 - - EE.VRELU.S8 q0, a14, a15 - EE.VST.128.IP q0, a2, 16 - - EE.VRELU.S8 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - 0: - - - loopgtz a6, 1f - EE.VLD.128.IP q0, a3, 16 - EE.VRELU.S8 q0, a14, a15 - EE.VST.128.IP q0, a2, 16 - 1: - - - EE.VLD.128.IP q0, a3, 16 - - EE.VRELU.S8 q0, a14, a15 - EE.VST.128.IP q0, a2, 16 - - retw - - - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_relu_11c - .type dl_tie728_s8_unaligned_relu_11c, @function - .section .iram1 -dl_tie728_s8_unaligned_relu_11c: - .align 4 - entry sp, 16 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - # a5: c_div_x_1 - # a6: c_remainder - # a14: activation_alpha - # a15: activation_shift - - - l32i a5, a4, 100 - l32i a6, a4, 136 - l32i a14, a4, 76 # activation_alpha - l32i a15, a4, 84 # activation_shift - - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a5, 0, dl_tie718_s8_unaligned_relu_11c_small_remainder # channel < 16 - - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie718_s8_unaligned_relu_11c_0 - beqi a13, 8, dl_tie718_s8_unaligned_relu_11c_1 - - - loopgtz a5, 0f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a6 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie718_s8_unaligned_relu_11c_remainder - - -dl_tie718_s8_unaligned_relu_11c_0: - loopgtz a5, 0f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - 0: - addi a3, a3, -16 - add a3, a3, a6 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie718_s8_unaligned_relu_11c_remainder - - -dl_tie718_s8_unaligned_relu_11c_1: - - loopgtz a5, 0f - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - 0: - addi a3, a3, -16 - add a3, a3, a6 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - j dl_tie718_s8_unaligned_relu_11c_remainder - - -dl_tie718_s8_unaligned_relu_11c_small_remainder: - EE.LD.128.USAR.XP q0, a3, a6 - rur.sar_byte a11 - - -dl_tie718_s8_unaligned_relu_11c_remainder: - - beqz a6, dl_tie728_s8_unaligned_relu_11c_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.VRELU.S8 q2, a14, a15 - # dl_tie728_s8_unaligned_store0 q2, a2, a13 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a6 - -dl_tie728_s8_unaligned_relu_11c_end: - retw diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_resize2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_resize2d.S deleted file mode 100644 index 75c7678f..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_resize2d.S +++ /dev/null @@ -1,115 +0,0 @@ -#include "dl_tie728_s8.S" - - .align 4 - .text - .global dl_tie728_s8_resize2d_nearest_2x2_c1 - .type dl_tie728_s8_resize2d_nearest_2x2_c1, @function -dl_tie728_s8_resize2d_nearest_2x2_c1: - .align 4 - entry sp, 24 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - # a5: output_x_offset - # a6: output_y_offset - # a7: c_div_x - # a8: remainder - # a9: output_shift - # a10: output_scale - - l32i a5, a4, 20 - l32i a6, a4, 24 - l32i a7, a4, 40 - l32i a8, a4, 44 - l32i a9, a4, 48 - l32i a10, a4, 52 - - # a11 (0, 1) - # a12 (1, 0) - # a13 (1, 1) - add a11, a2, a5 - add a12, a2, a6 - add a13, a11, a6 - - s8i a10, sp, 0 - ee.vldbc.8.ip q1, sp, 0 # all output_scale - ee.vld.128.ip q0, a3, 16 - - loopgtz a7, 0f - ee.zero.qacc - ee.vmulas.s8.qacc.ld.ip q0, a3, 16, q0, q1 - tie728_s8_vector_round_result q2, a9, a14, q7 - ee.vst.128.ip q2, a2, 16 - ee.vst.128.ip q2, a11, 16 - ee.vst.128.ip q2, a12, 16 - ee.vst.128.ip q2, a13, 16 - 0: - retw - - .align 4 - .text - .global dl_tie728_s8_unaligned_resize2d_nearest_2x2_c1 - .type dl_tie728_s8_unaligned_resize2d_nearest_2x2_c1, @function -dl_tie728_s8_unaligned_resize2d_nearest_2x2_c1: - .align 4 - entry sp, 24 - - # a2: int8_t *output_ptr - # a3: int8_t *input_ptr - # a4: void *args - # a5: output_x_offset - # a6: output_y_offset - # a7: c_div_x - # a8: remainder - # a9: output_shift - # a10: output_scale - - - l32i a5, a4, 20 - l32i a6, a4, 24 - l32i a7, a4, 40 - l32i a8, a4, 44 - l32i a9, a4, 48 - l32i a10, a4, 52 - - - # a11 (0, 1) - # a12 (1, 0) - # a13 (1, 1) - add a11, a2, a5 - add a12, a2, a6 - add a13, a11, a6 - - s8i a10, sp, 0 - ee.vldbc.8.ip q3, sp, 0 # all output_scale - ee.ld.128.usar.ip q0, a3, 16 - - loopgtz a7, 0f - ee.zero.qacc - ee.ld.128.usar.ip q1, a3, 16 - ee.src.q.qup q2, q0, q1 - ee.vmulas.s8.qacc q2, q3 - tie728_s8_vector_round_result q4, a9, a14, q7 - dl_tie728_s8_unaligned_store0 q4, a2, a14 - dl_tie728_s8_unaligned_store0 q4, a11, a14 - dl_tie728_s8_unaligned_store0 q4, a12, a14 - dl_tie728_s8_unaligned_store0 q4, a13, a14 - 0: - - bnez a8, dl_tie728_s8_unaligned_resize2d_nearest_2x2_c1_remainder - retw - -dl_tie728_s8_unaligned_resize2d_nearest_2x2_c1_remainder: - ee.zero.qacc - ee.ld.128.usar.ip q1, a3, 16 - ee.src.q.qup q2, q0, q1 - ee.vmulas.s8.qacc q2, q3 - tie728_s8_vector_round_result q4, a9, a14, q7 - dl_tie728_s8_store_remainder q4, a5, a6, a7, a10, a2, a8 - dl_tie728_s8_store_remainder q4, a5, a6, a7, a10, a11, a8 - dl_tie728_s8_store_remainder q4, a5, a6, a7, a10, a12, a8 - dl_tie728_s8_store_remainder q4, a5, a6, a7, a10, a13, a8 - retw - - \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_sub2d.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_sub2d.S deleted file mode 100644 index 660404bc..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/tie728/dl_tie728_s8_sub2d.S +++ /dev/null @@ -1,1692 +0,0 @@ -#include "dl_tie728_s8.S" - -############################################################################################################################################################ -#### -#### tie728_s8_sub2d_11c series -#### -############################################################################################################################################################ - -.macro dl_tie728_rescale_sub_rescale_output input0, input1, output, output_scale, output_shift, tmpq, rescale_input - - EE.ZERO.Q \tmpq - EE.VSUBS.S8 \tmpq, \tmpq, \output_scale - EE.ZERO.QACC - - blti \rescale_input, 2, 10f -# input1 is in the front - EE.VMULAS.S8.QACC \input1, \output_scale - EE.VMULAS.S8.QACC \input0, \tmpq - EE.SRCMB.S8.QACC \output, \output_shift, 0 - - j 11f - -10: # input0 is in the front - EE.VMULAS.S8.QACC \input0, \output_scale - EE.VMULAS.S8.QACC \input1, \tmpq - EE.SRCMB.S8.QACC \output, \output_shift, 0 - -11: - -.endm - - - - .align 4 - .text - .global dl_tie728_s8_sub2d_11c - .type dl_tie728_s8_sub2d_11c, @function - .section .iram1 -dl_tie728_s8_sub2d_11c: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_2x_1 - # a7: c_left_x_1 - - l32i a6, a5, 68 - l32i a7, a5, 72 - - blti a6, 1, dl_tie728_s8_sub2d_small_channel - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.VLD.128.IP q2, a3, 16 - EE.VSUBS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VSUBS.S8.LD.INCP q1, a4, q5, q2, q3 - EE.VST.128.IP q5, a2, 16 - 0: - - beqi a7, 1, 2f # remainder == 2*16byte - beqi a7, 2, 3f # remainder == 3*16byte - - 2: - EE.VLD.128.IP q2, a3, 16 - EE.VSUBS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VST.128.IP q4, a2, 16 - - EE.VSUBS.S8 q5, q2, q3 - EE.VST.128.IP q5, a2, 16 - retw - - 3: - EE.VLD.128.IP q2, a3, 16 - EE.VSUBS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VSUBS.S8.LD.INCP q1, a4, q5, q2, q3 - EE.VST.128.IP q5, a2, 16 - - EE.VSUBS.S8 q4, q0, q1 - EE.VST.128.IP q4, a2, 16 - retw - - -dl_tie728_s8_sub2d_small_channel: # channel < 3*s (16) - - loopgtz a7, 0f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - EE.VSUBS.S8 q2, q0, q1 - EE.VST.128.IP q2, a2, 16 - 0: - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - EE.VSUBS.S8 q2, q0, q1 - EE.VST.128.IP q2, a2, 16 - - retw - - - - - .align 4 - .text - .global dl_tie728_s8_rescale_sub2d_11c - .type dl_tie728_s8_rescale_sub2d_11c, @function - .section .iram1 -dl_tie728_s8_rescale_sub2d_11c: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr: >> shift or *scale) >> shift - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr: the one need to be rescaled - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a12: rescale_input - - - l32i a6, a5, 64 - l32i a7, a5, 88 - l32i a8, a5, 96 - l32i a9, a5, 92 - l32i a12, a5, 80 - - beqi a8, 1, dl_tie728_s8_rescale_sub2d_output - -dl_tie728_s8_rescale_sub2d_output_scale: # *scale) >> shift - - s8i a8, a1, 0 - EE.VLDBC.8 q7, a1 # all output_scale - - loopgtz a6, dl_tie728_s8_rescale_sub2d_11c_output - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_sub_rescale_output q0, q1, q1, q7, a9, q4, a12 - - EE.VST.128.IP q1, a2, 16 - dl_tie728_s8_rescale_sub2d_11c_output: - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_sub_rescale_output q0, q1, q1, q7, a9, q4, a12 - - EE.VST.128.IP q1, a2, 16 - - retw - - - -dl_tie728_s8_rescale_sub2d_output: # >> shift - movi a13, -1 - s8i a13, a1, 0 - EE.VLDBC.8 q7, a1 # all -1 - - blti a12, 2, dl_tie728_s8_rescale_sub2d_output_0 - -# input1 in the front - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - loopgtz a6, dl_tie728_s8_rescale_sub2d_11c_1 - EE.SRCMB.S8.QACC q1, a7, 0 - EE.VMULAS.S8.QACC.LD.IP q0, a3, 16, q0, q7 # input1 >> shift - input0 - EE.SRCMB.S8.QACC q1, a9, 0 - EE.LDQA.S8.128.IP a4, 16 - EE.VST.128.IP q1, a2, 16 - dl_tie728_s8_rescale_sub2d_11c_1: - - EE.SRCMB.S8.QACC q1, a7, 0 - EE.VMULAS.S8.QACC.LD.IP q0, a3, 16, q0, q7 # input1 >> shift - input0 - EE.SRCMB.S8.QACC q1, a9, 0 - - EE.VST.128.IP q1, a2, 16 - retw - - -# input0 in the front -dl_tie728_s8_rescale_sub2d_output_0: - - EE.LDQA.S8.128.IP a4, 16 - loopgtz a6, dl_tie728_s8_rescale_sub2d_11c_0 - EE.SRCMB.S8.QACC q1, a7, 0 - - EE.LDQA.S8.128.IP a3, 16 - EE.VMULAS.S8.QACC q1, q7 # input0 - input1 >> shift - EE.SRCMB.S8.QACC q1, a9, 0 - - EE.LDQA.S8.128.IP a4, 16 - EE.VST.128.IP q1, a2, 16 - dl_tie728_s8_rescale_sub2d_11c_0: - - EE.SRCMB.S8.QACC q1, a7, 0 - - EE.LDQA.S8.128.IP a3, 16 - EE.VMULAS.S8.QACC q1, q7 # input0 - input1 >> shift - EE.SRCMB.S8.QACC q1, a9, 0 - - EE.VST.128.IP q1, a2, 16 - retw - - - - - .align 4 - .text - .global dl_tie728_s8_sub2d_11c_relu - .type dl_tie728_s8_sub2d_11c_relu, @function - .section .iram1 -dl_tie728_s8_sub2d_11c_relu: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_2x_1 - # a7: c_left_x_1 - # a14: activation_alpha - # a15: activation_shift - - - l32i a6, a5, 68 - l32i a7, a5, 72 - l32i a14, a5, 52 - l32i a15, a5, 60 - - blti a6, 1, dl_tie728_s8_sub2d_relu_small_channel - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.VLD.128.IP q2, a3, 16 - EE.VSUBS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VRELU.S8 q4, a14, a15 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VSUBS.S8.LD.INCP q1, a4, q5, q2, q3 - EE.VRELU.S8 q5, a14, a15 - EE.VST.128.IP q5, a2, 16 - 0: - - beqi a7, 1, 2f #remainder == 2*16byte - beqi a7, 2, 3f #remainder == 3*16byte - - 2: - EE.VLD.128.IP q2, a3, 16 - EE.VSUBS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VRELU.S8 q4, a14, a15 - EE.VST.128.IP q4, a2, 16 - - EE.VSUBS.S8 q5, q2, q3 - EE.VRELU.S8 q5, a14, a15 - EE.VST.128.IP q5, a2, 16 - retw - - 3: - EE.VLD.128.IP q2, a3, 16 - EE.VSUBS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VRELU.S8 q4, a14, a15 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VSUBS.S8.LD.INCP q1, a4, q5, q2, q3 - EE.VRELU.S8 q5, a14, a15 - EE.VST.128.IP q5, a2, 16 - - EE.VSUBS.S8 q4, q0, q1 - EE.VRELU.S8 q4, a14, a15 - EE.VST.128.IP q4, a2, 16 - retw - - -dl_tie728_s8_sub2d_relu_small_channel: # channel < 3*s - - loopgtz a7, 0f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - EE.VSUBS.S8 q2, q0, q1 - - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - 0: - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - EE.VSUBS.S8 q2, q0, q1 - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - - retw - - - - - .align 4 - .text - .global dl_tie728_s8_rescale_sub2d_11c_relu - .type dl_tie728_s8_rescale_sub2d_11c_relu, @function - .section .iram1 -dl_tie728_s8_rescale_sub2d_11c_relu: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr: >> shift or *scale) >> shift - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr: the one need to be rescaled - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a12: rescale_input - # a14: activation_alpha - # a15: activation_shift - - - l32i a6, a5, 64 - l32i a7, a5, 88 - l32i a8, a5, 96 - l32i a9, a5, 92 - l32i a12, a5, 80 - l32i a14, a5, 52 - l32i a15, a5, 60 - - - beqi a8, 1, dl_tie728_s8_rescale_sub2d_output_relu - - -dl_tie728_s8_rescale_sub2d_output_scale_relu: # *scale) >> shift - - s8i a8, a1, 0 - EE.VLDBC.8 q7, a1 # all output_scale - - loopgtz a6, dl_tie728_s8_rescale_sub2d_11c_output_relu - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_sub_rescale_output q0, q1, q1, q7, a9, q4, a12 - - EE.VRELU.S8 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - dl_tie728_s8_rescale_sub2d_11c_output_relu: - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_sub_rescale_output q0, q1, q1, q7, a9, q4, a12 - - EE.VRELU.S8 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - retw - - - -dl_tie728_s8_rescale_sub2d_output_relu: # >> shift - movi a13, -1 - s8i a13, a1, 0 - EE.VLDBC.8 q7, a1 # all -1 - - blti a12, 2, dl_tie728_s8_rescale_sub2d_output_relu_0 - -# input1 in the front - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - loopgtz a6, dl_tie728_s8_rescale_sub2d_11c_relu_1 - EE.SRCMB.S8.QACC q1, a7, 0 - EE.VMULAS.S8.QACC.LD.IP q0, a3, 16, q0, q7 # input1 >> shift - input0 - EE.SRCMB.S8.QACC q1, a9, 0 - EE.LDQA.S8.128.IP a4, 16 - EE.VRELU.S8 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - dl_tie728_s8_rescale_sub2d_11c_relu_1: - - EE.SRCMB.S8.QACC q1, a7, 0 - EE.VMULAS.S8.QACC.LD.IP q0, a3, 16, q0, q7 # input1 >> shift - input0 - EE.SRCMB.S8.QACC q1, a9, 0 - - EE.VRELU.S8 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - retw - - -# input0 in the front -dl_tie728_s8_rescale_sub2d_output_relu_0: - - EE.LDQA.S8.128.IP a4, 16 - loopgtz a6, dl_tie728_s8_rescale_sub2d_11c_relu_0 - EE.SRCMB.S8.QACC q1, a7, 0 - - EE.LDQA.S8.128.IP a3, 16 - EE.VMULAS.S8.QACC q1, q7 # input0 - input1 >> shift - EE.SRCMB.S8.QACC q1, a9, 0 - - EE.LDQA.S8.128.IP a4, 16 - EE.VRELU.S8 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - dl_tie728_s8_rescale_sub2d_11c_relu_0: - - EE.SRCMB.S8.QACC q1, a7, 0 - - EE.LDQA.S8.128.IP a3, 16 - EE.VMULAS.S8.QACC q1, q7 # input0 - input1 >> shift - EE.SRCMB.S8.QACC q1, a9, 0 - - EE.VRELU.S8 q1, a14, a15 - EE.VST.128.IP q1, a2, 16 - retw - - - - - - .align 4 - .text - .global dl_tie728_s8_sub2d_11c_prelu - .type dl_tie728_s8_sub2d_11c_prelu, @function - .section .iram1 -dl_tie728_s8_sub2d_11c_prelu: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_2x_1 - # a7: c_left_x_1 - # a14: activation_alpha_ptr - # a15: activation_shift - - - l32i a6, a5, 68 - l32i a7, a5, 72 - l32i a14, a5, 56 - l32i a15, a5, 60 - - blti a6, 1, dl_tie728_s8_sub2d_prelu_small_channel - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - loopgtz a6, 0f - EE.VLD.128.IP q2, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VPRELU.S8 q4, q4, q6, a15 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S8.LD.INCP q1, a4, q5, q2, q3 - EE.VPRELU.S8 q5, q5, q6, a15 - EE.VST.128.IP q5, a2, 16 - 0: - - beqi a7, 1, 2f #remainder == 2*16byte - beqi a7, 2, 3f #remainder == 3*16byte - - 2: - EE.VLD.128.IP q2, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VPRELU.S8 q4, q4, q6, a15 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S8 q5, q2, q3 - - EE.VPRELU.S8 q5, q5, q6, a15 - EE.VST.128.IP q5, a2, 16 - retw - - 3: - EE.VLD.128.IP q2, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S8.LD.INCP q3, a4, q4, q0, q1 - EE.VPRELU.S8 q4, q4, q6, a15 - EE.VST.128.IP q4, a2, 16 - - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S8.LD.INCP q1, a4, q5, q2, q3 - EE.VPRELU.S8 q5, q5, q6, a15 - EE.VST.128.IP q5, a2, 16 - - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S8 q4, q0, q1 - - EE.VPRELU.S8 q4, q4, q6, a15 - EE.VST.128.IP q4, a2, 16 - retw - - -dl_tie728_s8_sub2d_prelu_small_channel: # channel < 3*s - - loopgtz a7, 0f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - EE.VLD.128.IP q3, a14, 16 - EE.VSUBS.S8 q2, q0, q1 - - EE.VPRELU.S8 q2, q2, q3, a15 - EE.VST.128.IP q2, a2, 16 - 0: - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a4, 16 - - EE.VLD.128.IP q3, a14, 16 - EE.VSUBS.S8 q2, q0, q1 - - EE.VPRELU.S8 q2, q2, q3, a15 - EE.VST.128.IP q2, a2, 16 - retw - - - - - .align 4 - .text - .global dl_tie728_s8_rescale_sub2d_11c_prelu - .type dl_tie728_s8_rescale_sub2d_11c_prelu, @function - .section .iram1 -dl_tie728_s8_rescale_sub2d_11c_prelu: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr: >> shift or *scale) >> shift - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr: the one need to be rescaled - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a12: rescale_input - # a14: activation_alpha_ptr - # a15: activation_shift - - - l32i a6, a5, 64 - l32i a7, a5, 88 - l32i a8, a5, 96 - l32i a9, a5, 92 - l32i a12, a5, 80 - l32i a14, a5, 56 - l32i a15, a5, 60 - - - beqi a8, 1, dl_tie728_s8_rescale_sub2d_output_prelu - - -dl_tie728_s8_rescale_sub2d_output_scale_prelu: # *scale) >> shift - - s8i a8, a1, 0 - EE.VLDBC.8 q7, a1 # all output_scale - - loopgtz a6, dl_tie728_s8_rescale_sub2d_11c_output_prelu - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S8.QACC q1, a7, 0 - - EE.VLD.128.IP q5, a14, 16 - dl_tie728_rescale_sub_rescale_output q0, q1, q1, q7, a9, q4, a12 - - EE.VPRELU.S8 q1, q1, q5, a15 - EE.VST.128.IP q1, a2, 16 - dl_tie728_s8_rescale_sub2d_11c_output_prelu: - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - EE.SRCMB.S8.QACC q1, a7, 0 - - EE.VLD.128.IP q5, a14, 16 - dl_tie728_rescale_sub_rescale_output q0, q1, q1, q7, a9, q4, a12 - - EE.VPRELU.S8 q1, q1, q5, a15 - EE.VST.128.IP q1, a2, 16 - - retw - - - -dl_tie728_s8_rescale_sub2d_output_prelu: # >> shift - movi a13, -1 - s8i a13, a1, 0 - EE.VLDBC.8 q7, a1 # all -1 - - blti a12, 2, dl_tie728_s8_rescale_sub2d_output_prelu_0 - -# input1 in the front - EE.LDQA.S8.128.IP a4, 16 - EE.VLD.128.IP q0, a3, 16 - loopgtz a6, dl_tie728_s8_rescale_sub2d_11c_prelu_1 - EE.SRCMB.S8.QACC q1, a7, 0 - EE.VMULAS.S8.QACC.LD.IP q0, a3, 16, q0, q7 # input1 >> shift - input0 - EE.SRCMB.S8.QACC q1, a9, 0 - - EE.VLD.128.IP q6, a14, 16 - EE.LDQA.S8.128.IP a4, 16 - EE.VPRELU.S8 q1, q1, q6, a15 - EE.VST.128.IP q1, a2, 16 - dl_tie728_s8_rescale_sub2d_11c_prelu_1: - - EE.SRCMB.S8.QACC q1, a7, 0 - EE.VMULAS.S8.QACC.LD.IP q0, a3, 16, q0, q7 # input1 >> shift - input0 - - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S8.QACC q1, a9, 0 - - EE.VPRELU.S8 q1, q1, q6, a15 - EE.VST.128.IP q1, a2, 16 - retw - - -# input0 in the front -dl_tie728_s8_rescale_sub2d_output_prelu_0: - - EE.LDQA.S8.128.IP a4, 16 - loopgtz a6, dl_tie728_s8_rescale_sub2d_11c_prelu_0 - EE.SRCMB.S8.QACC q1, a7, 0 - - EE.LDQA.S8.128.IP a3, 16 - EE.VMULAS.S8.QACC q1, q7 # input0 - input1 >> shift - EE.SRCMB.S8.QACC q1, a9, 0 - - EE.VLD.128.IP q6, a14, 16 - EE.LDQA.S8.128.IP a4, 16 - EE.VPRELU.S8 q1, q1, q6, a15 - EE.VST.128.IP q1, a2, 16 - dl_tie728_s8_rescale_sub2d_11c_prelu_0: - - EE.SRCMB.S8.QACC q1, a7, 0 - - EE.LDQA.S8.128.IP a3, 16 - EE.VMULAS.S8.QACC q1, q7 # input0 - input1 >> shift - - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S8.QACC q1, a9, 0 - - EE.VPRELU.S8 q1, q1, q6, a15 - EE.VST.128.IP q1, a2, 16 - retw - - - - - - - - - -############################################################################################################################################################ -#### -#### tie728_s8_unaligned_sub2d_11c series -#### -############################################################################################################################################################ - - .align 4 - .text - .global dl_tie728_s8_unaligned_sub2d_11c - .type dl_tie728_s8_unaligned_sub2d_11c, @function - .section .iram1 -dl_tie728_s8_unaligned_sub2d_11c: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a10: c_remainder - # a12: rescale_input - - l32i a6, a5, 64 - l32i a10, a5, 76 - l32i a7, a5, 88 - l32i a12, a5, 80 - - bgei a7, 0, dl_tie728_s8_unaligned_rescale_sub2d - -# input0 exp = input1 exp = output exp - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie728_s8_unaligned_sub2d_11c_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie728_s8_unaligned_sub2d_11c_0 - beqi a13, 8, dl_tie728_s8_unaligned_sub2d_11c_1 - - - loopgtz a6, 0f #dl_tie728_s8_unaligned_sub2d_11c - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.LD.128.USAR.IP q1, a3, 16 - - EE.VSUBS.S8 q2, q2, q5 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S8 q2, q2, q5 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie728_s8_unaligned_sub2d_11c_remainder - - #output sar = 0 - dl_tie728_s8_unaligned_sub2d_11c_0: - loopgtz a6, 1f #dl_tie728_s8_unaligned_sub2d_11c_loop0 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.LD.128.USAR.IP q1, a3, 16 - - EE.VSUBS.S8 q2, q2, q5 - EE.VST.128.IP q2, a2, 16 - 1: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S8 q2, q2, q5 - EE.VST.128.IP q2, a2, 16 - j dl_tie728_s8_unaligned_sub2d_11c_remainder - - # #output sar = 8 - dl_tie728_s8_unaligned_sub2d_11c_1: - loopgtz a6, 2f #dl_tie728_s8_unaligned_sub2d_11c_loop1 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.LD.128.USAR.IP q1, a3, 16 - - EE.VSUBS.S8 q2, q2, q5 - dl_tie728_s8_unaligned_store1 q2, a2 - 2: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S8 q2, q2, q5 - dl_tie728_s8_unaligned_store1 q2, a2 - j dl_tie728_s8_unaligned_sub2d_11c_remainder - -dl_tie728_s8_unaligned_sub2d_11c_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 - -dl_tie728_s8_unaligned_sub2d_11c_remainder: - - beqz a10, dl_tie728_s8_unaligned_sub2d_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VSUBS.S8 q2, q2, q5 - - # dl_tie728_s8_unaligned_store0 q2, a2, a13 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_sub2d_end: - - retw - - -# rescaled sub -dl_tie728_s8_unaligned_rescale_sub2d: - l32i a8, a5, 96 # output_scale - l32i a9, a5, 92 # output_shift - - beqi a8, 1, dl_tie728_s8_rescale_unaligned_sub2d_output_shift - - -# rescaled to output by *scale) >> shift -dl_tie728_s8_rescale_unaligned_sub2d_output_scale: - - s8i a8, a1, 0 - EE.VLDBC.8 q7, a1 # all output_scale - - blti a6, 0, dl_tie728_s8_unaligned_sub2d_scale_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 3f #dl_tie728_s8_rescale_unaligned_sub2d_11c_scale - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_sub_rescale_output q2, q1, q2, q7, a9, q4, a12 - - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_s8_unaligned_store0 q2, a2, a11 - 3: - - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a6 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_sub_rescale_output q2, q1, q2, q7, a9, q4, a12 - - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie728_s8_unaligned_sub2d_scale_remainder - - -dl_tie728_s8_unaligned_sub2d_scale_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a6 - -dl_tie728_s8_unaligned_sub2d_scale_remainder: - beqz a10, dl_tie728_s8_unaligned_rescale_sub2d_output_scale_end # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a6 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_sub_rescale_output q2, q1, q2, q7, a9, q4, a12 - - # dl_tie728_s8_unaligned_store0 q2, a2, a9 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_rescale_sub2d_output_scale_end: - retw - -# rescaled to output by right shift -dl_tie728_s8_rescale_unaligned_sub2d_output_shift: - movi a13, -1 - s8i a13, a1, 0 - EE.VLDBC.8 q7, a1 # all -1 - - blti a6, 0, dl_tie728_s8_unaligned_sub2d_shift_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 4f #dl_tie728_s8_rescale_unaligned_sub2d_11c_shift - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S8.QACC q5 - - EE.SRCMB.S8.QACC q5, a7, 0 # input1 >> shift - - blti a12, 2, 11f - EE.VMULAS.S8.QACC q2, q7 # input1 >> shift - input0 - j 12f - 11: - EE.MOV.S8.QACC q2 - EE.VMULAS.S8.QACC q5, q7 # input0 - input1 >> shift - 12: - - EE.SRCMB.S8.QACC q5, a9, 0 - - EE.LD.128.USAR.IP q1, a3, 16 - dl_tie728_s8_unaligned_store0 q5, a2, a13 - - 4: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a6 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q5, a7, 0 - - blti a12, 2, 11f - EE.VMULAS.S8.QACC q2, q7 # input1 >> shift - input0 - j 12f - 11: - EE.MOV.S8.QACC q2 - EE.VMULAS.S8.QACC q5, q7 # input0 - input1 >> shift - 12: - EE.SRCMB.S8.QACC q5, a9, 0 - - dl_tie728_s8_unaligned_store0 q5, a2, a13 - j dl_tie728_s8_unaligned_sub2d_shift_remainder - - -dl_tie728_s8_unaligned_sub2d_shift_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a6 - -dl_tie728_s8_unaligned_sub2d_shift_remainder: - beqz a10, dl_tie728_s8_unaligned_rescale_sub2d_output_shift_end # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a6 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q5, a7, 0 - - blti a12, 2, 11f - EE.VMULAS.S8.QACC q2, q7 # input1 >> shift - input0 - j 12f -11: - EE.MOV.S8.QACC q2 - EE.VMULAS.S8.QACC q5, q7 # input0 - input1 >> shift -12: - EE.SRCMB.S8.QACC q5, a9, 0 - - # dl_tie728_s8_unaligned_store0 q5, a2, a13 - dl_tie728_s8_store_remainder q5, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_rescale_sub2d_output_shift_end: - retw - - - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_sub2d_11c_relu - .type dl_tie728_s8_unaligned_sub2d_11c_relu, @function - .section .iram1 -dl_tie728_s8_unaligned_sub2d_11c_relu: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a10: c_remainder - # a12: rescale_input - # a14: activation_alpha - # a15: activation_shift - - l32i a6, a5, 64 - l32i a10, a5, 76 - l32i a7, a5, 88 - l32i a12, a5, 80 - l32i a14, a5, 52 - l32i a15, a5, 60 - - bgei a7, 0, dl_tie728_s8_unaligned_rescale_sub2d_relu - -# input0 exp = input1 exp = output exp - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie728_s8_unaligned_sub2d_11c_relu_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie728_s8_unaligned_sub2d_11c_relu_0 - beqi a13, 8, dl_tie728_s8_unaligned_sub2d_11c_relu_1 - - - loopgtz a6, 0f #dl_tie728_s8_unaligned_sub2d_11c - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.LD.128.USAR.IP q1, a3, 16 - - EE.VSUBS.S8 q2, q2, q5 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S8 q2, q2, q5 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie728_s8_unaligned_sub2d_11c_relu_remainder - - #output sar = 0 - dl_tie728_s8_unaligned_sub2d_11c_relu_0: - loopgtz a6, 1f #dl_tie728_s8_unaligned_sub2d_11c_loop0 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.LD.128.USAR.IP q1, a3, 16 - - EE.VSUBS.S8 q2, q2, q5 - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - 1: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S8 q2, q2, q5 - EE.VRELU.S8 q2, a14, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie728_s8_unaligned_sub2d_11c_relu_remainder - - # #output sar = 8 - dl_tie728_s8_unaligned_sub2d_11c_relu_1: - loopgtz a6, 2f #dl_tie728_s8_unaligned_sub2d_11c_loop1 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.LD.128.USAR.IP q1, a3, 16 - - EE.VSUBS.S8 q2, q2, q5 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - 2: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VSUBS.S8 q2, q2, q5 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - j dl_tie728_s8_unaligned_sub2d_11c_relu_remainder - -dl_tie728_s8_unaligned_sub2d_11c_relu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 - -dl_tie728_s8_unaligned_sub2d_11c_relu_remainder: - - beqz a10, dl_tie728_s8_unaligned_sub2d_relu_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VSUBS.S8 q2, q2, q5 - EE.VRELU.S8 q2, a14, a15 - # dl_tie728_s8_unaligned_store0 q2, a2, a13 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_sub2d_relu_end: - - retw - - -# rescaled sub -dl_tie728_s8_unaligned_rescale_sub2d_relu: - l32i a8, a5, 96 # output_scale - l32i a9, a5, 92 # output_shift - - beqi a8, 1, dl_tie728_s8_rescale_unaligned_sub2d_output_shift_relu - - -# rescaled to output by *scale) >> shift -dl_tie728_s8_rescale_unaligned_sub2d_output_scale_relu: - - s8i a8, a1, 0 - EE.VLDBC.8 q7, a1 # all output_scale - - blti a6, 0, dl_tie728_s8_unaligned_sub2d_scale_relu_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 3f #dl_tie728_s8_rescale_unaligned_sub2d_11c_scale - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_sub_rescale_output q2, q1, q2, q7, a9, q4, a12 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a11 - 3: - - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a6 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_sub_rescale_output q2, q1, q2, q7, a9, q4, a12 - - EE.VRELU.S8 q2, a14, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie728_s8_unaligned_sub2d_scale_relu_remainder - - -dl_tie728_s8_unaligned_sub2d_scale_relu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a6 - -dl_tie728_s8_unaligned_sub2d_scale_relu_remainder: - beqz a10, dl_tie728_s8_unaligned_rescale_sub2d_output_scale_relu_end # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a6 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_sub_rescale_output q2, q1, q2, q7, a9, q4, a12 - - EE.VRELU.S8 q2, a14, a15 - # dl_tie728_s8_unaligned_store0 q2, a2, a9 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_rescale_sub2d_output_scale_relu_end: - retw - -# rescaled to output by right shift -dl_tie728_s8_rescale_unaligned_sub2d_output_shift_relu: - movi a13, -1 - s8i a13, a1, 0 - EE.VLDBC.8 q7, a1 # all -1 - - blti a6, 0, dl_tie728_s8_unaligned_sub2d_shift_relu_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 4f #dl_tie728_s8_rescale_unaligned_sub2d_11c_shift - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S8.QACC q5 - - EE.SRCMB.S8.QACC q5, a7, 0 # input1 >> shift - - blti a12, 2, 11f - EE.VMULAS.S8.QACC q2, q7 # input1 >> shift - input0 - j 12f - 11: - EE.MOV.S8.QACC q2 - EE.VMULAS.S8.QACC q5, q7 # input0 - input1 >> shift - 12: - - EE.SRCMB.S8.QACC q5, a9, 0 - - EE.LD.128.USAR.IP q1, a3, 16 - EE.VRELU.S8 q5, a14, a15 - dl_tie728_s8_unaligned_store0 q5, a2, a13 - - 4: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a6 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q5, a7, 0 - - blti a12, 2, 11f - EE.VMULAS.S8.QACC q2, q7 # input1 >> shift - input0 - j 12f - 11: - EE.MOV.S8.QACC q2 - EE.VMULAS.S8.QACC q5, q7 # input0 - input1 >> shift - 12: - EE.SRCMB.S8.QACC q5, a9, 0 - EE.VRELU.S8 q5, a14, a15 - dl_tie728_s8_unaligned_store0 q5, a2, a13 - j dl_tie728_s8_unaligned_sub2d_shift_relu_remainder - - -dl_tie728_s8_unaligned_sub2d_shift_relu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a6 - -dl_tie728_s8_unaligned_sub2d_shift_relu_remainder: - beqz a10, dl_tie728_s8_unaligned_rescale_sub2d_output_shift_relu_end # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a6 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q5, a7, 0 - - blti a12, 2, 11f - EE.VMULAS.S8.QACC q2, q7 # input1 >> shift - input0 - j 12f -11: - EE.MOV.S8.QACC q2 - EE.VMULAS.S8.QACC q5, q7 # input0 - input1 >> shift -12: - EE.SRCMB.S8.QACC q5, a9, 0 - EE.VRELU.S8 q5, a14, a15 - # dl_tie728_s8_unaligned_store0 q5, a2, a13 - dl_tie728_s8_store_remainder q5, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_rescale_sub2d_output_shift_relu_end: - retw - - - - - - .align 4 - .text - .global dl_tie728_s8_unaligned_sub2d_11c_prelu - .type dl_tie728_s8_unaligned_sub2d_11c_prelu, @function - .section .iram1 -dl_tie728_s8_unaligned_sub2d_11c_prelu: - .align 4 - entry sp, 32 - - # a2: int8_t *output_ptr - # a3: int8_t *input0_ptr - # a4: int8_t *input1_ptr - # a5: void *args - # a6: c_div_x_1 - # a7: input_shift - # a8: output_scale - # a9: output_shift - # a10: c_remainder - # a12: rescale_input - # a14: activation_alpha_ptr - # a15: activation_shift - - l32i a6, a5, 64 - l32i a10, a5, 76 - l32i a7, a5, 88 - l32i a12, a5, 80 - l32i a14, a5, 56 - l32i a15, a5, 60 - - bgei a7, 0, dl_tie728_s8_unaligned_rescale_sub2d_prelu - -# input0 exp = input1 exp = output exp - - EE.LD.128.USAR.IP q5, a2, 0 #get output_ptr sar_byte - rur.sar_byte a13 - - blti a6, 0, dl_tie728_s8_unaligned_sub2d_11c_prelu_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - beqi a13, 0, dl_tie728_s8_unaligned_sub2d_11c_prelu_0 - beqi a13, 8, dl_tie728_s8_unaligned_sub2d_11c_prelu_1 - - - loopgtz a6, 0f #dl_tie728_s8_unaligned_sub2d_11c - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.LD.128.USAR.IP q1, a3, 16 - - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S8 q2, q2, q5 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - 0: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S8 q2, q2, q5 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie728_s8_unaligned_sub2d_11c_prelu_remainder - - #output sar = 0 - dl_tie728_s8_unaligned_sub2d_11c_prelu_0: - loopgtz a6, 1f #dl_tie728_s8_unaligned_sub2d_11c_loop0 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.LD.128.USAR.IP q1, a3, 16 - - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S8 q2, q2, q5 - EE.VPRELU.S8 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 - 1: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S8 q2, q2, q5 - EE.VPRELU.S8 q2, q2, q6, a15 - EE.VST.128.IP q2, a2, 16 - j dl_tie728_s8_unaligned_sub2d_11c_prelu_remainder - - # #output sar = 8 - dl_tie728_s8_unaligned_sub2d_11c_prelu_1: - loopgtz a6, 2f #dl_tie728_s8_unaligned_sub2d_11c_loop1 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.LD.128.USAR.IP q1, a3, 16 - - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S8 q2, q2, q5 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - 2: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a12 - EE.SRC.Q.QUP q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S8 q2, q2, q5 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store1 q2, a2 - j dl_tie728_s8_unaligned_sub2d_11c_prelu_remainder - -dl_tie728_s8_unaligned_sub2d_11c_prelu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a12 - -dl_tie728_s8_unaligned_sub2d_11c_prelu_remainder: - - beqz a10, dl_tie728_s8_unaligned_sub2d_prelu_end - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a12 - EE.SRC.Q q5, q3, q4 - - EE.VLD.128.IP q6, a14, 16 - EE.VSUBS.S8 q2, q2, q5 - EE.VPRELU.S8 q2, q2, q6, a15 - # dl_tie728_s8_unaligned_store0 q2, a2, a13 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_sub2d_prelu_end: - - retw - - -# rescaled sub -dl_tie728_s8_unaligned_rescale_sub2d_prelu: - l32i a8, a5, 96 # output_scale - l32i a9, a5, 92 # output_shift - - beqi a8, 1, dl_tie728_s8_rescale_unaligned_sub2d_output_shift_prelu - - -# rescaled to output by *scale) >> shift -dl_tie728_s8_rescale_unaligned_sub2d_output_scale_prelu: - - s8i a8, a1, 0 - EE.VLDBC.8 q7, a1 # all output_scale - - blti a6, 0, dl_tie728_s8_unaligned_sub2d_scale_prelu_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 3f #dl_tie728_s8_rescale_unaligned_sub2d_11c_scale - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - dl_tie728_rescale_sub_rescale_output q2, q1, q2, q7, a9, q4, a12 - - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a11 - 3: - - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a6 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - EE.VLD.128.IP q6, a14, 16 - dl_tie728_rescale_sub_rescale_output q2, q1, q2, q7, a9, q4, a12 - - EE.VPRELU.S8 q2, q2, q6, a15 - dl_tie728_s8_unaligned_store0 q2, a2, a13 - j dl_tie728_s8_unaligned_sub2d_scale_prelu_remainder - - -dl_tie728_s8_unaligned_sub2d_scale_prelu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a6 - -dl_tie728_s8_unaligned_sub2d_scale_prelu_remainder: - beqz a10, dl_tie728_s8_unaligned_rescale_sub2d_output_scale_prelu_end # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a6 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q1, a7, 0 - - EE.VLD.128.IP q6, a14, 16 - dl_tie728_rescale_sub_rescale_output q2, q1, q2, q7, a9, q4, a12 - - EE.VPRELU.S8 q2, q2, q6, a15 - # dl_tie728_s8_unaligned_store0 q2, a2, a9 - dl_tie728_s8_store_remainder q2, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_rescale_sub2d_output_scale_prelu_end: - retw - -# rescaled to output by right shift -dl_tie728_s8_rescale_unaligned_sub2d_output_shift_prelu: - movi a13, -1 - s8i a13, a1, 0 - EE.VLDBC.8 q7, a1 # all -1 - - blti a6, 0, dl_tie728_s8_unaligned_sub2d_shift_prelu_small_remainder # channel < 16 - - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q3, a4, 16 - EE.LD.128.USAR.IP q1, a3, 16 - - loopgtz a6, 4f #dl_tie728_s8_rescale_unaligned_sub2d_11c_shift - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 16 - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S8.QACC q5 - - EE.SRCMB.S8.QACC q5, a7, 0 # input1 >> shift - - blti a12, 2, 11f - EE.VMULAS.S8.QACC q2, q7 # input1 >> shift - input0 - j 12f - 11: - EE.MOV.S8.QACC q2 - EE.VMULAS.S8.QACC q5, q7 # input0 - input1 >> shift - 12: - - EE.SRCMB.S8.QACC q5, a9, 0 - EE.VLD.128.IP q6, a14, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.VPRELU.S8 q5, q5, q6, a15 - dl_tie728_s8_unaligned_store0 q5, a2, a13 - - 4: - addi a3, a3, -16 - add a3, a3, a10 - rur.sar_byte a11 #input0 sar - EE.SRC.Q.QUP q2, q0, q1 - - EE.LD.128.USAR.XP q4, a4, a10 - rur.sar_byte a6 #input1 sar - EE.SRC.Q.QUP q5, q3, q4 - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q5, a7, 0 - - blti a12, 2, 11f - EE.VMULAS.S8.QACC q2, q7 # input1 >> shift - input0 - j 12f - 11: - EE.MOV.S8.QACC q2 - EE.VMULAS.S8.QACC q5, q7 # input0 - input1 >> shift - 12: - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S8.QACC q5, a9, 0 - EE.VPRELU.S8 q5, q5, q6, a15 - dl_tie728_s8_unaligned_store0 q5, a2, a13 - j dl_tie728_s8_unaligned_sub2d_shift_prelu_remainder - - -dl_tie728_s8_unaligned_sub2d_shift_prelu_small_remainder: - EE.LD.128.USAR.XP q0, a3, a10 - rur.sar_byte a11 - - EE.LD.128.USAR.XP q3, a4, a10 - rur.sar_byte a6 - -dl_tie728_s8_unaligned_sub2d_shift_prelu_remainder: - beqz a10, dl_tie728_s8_unaligned_rescale_sub2d_output_shift_prelu_end # c remainder - - EE.LD.128.USAR.IP q1, a3, 0 - wur.sar_byte a11 - EE.SRC.Q q2, q0, q1 - - EE.LD.128.USAR.IP q4, a4, 0 - wur.sar_byte a6 - EE.SRC.Q q5, q3, q4 - - EE.MOV.S8.QACC q5 - EE.SRCMB.S8.QACC q5, a7, 0 - - blti a12, 2, 11f - EE.VMULAS.S8.QACC q2, q7 # input1 >> shift - input0 - j 12f -11: - EE.MOV.S8.QACC q2 - EE.VMULAS.S8.QACC q5, q7 # input0 - input1 >> shift -12: - EE.VLD.128.IP q6, a14, 16 - EE.SRCMB.S8.QACC q5, a9, 0 - EE.VPRELU.S8 q5, q5, q6, a15 - # dl_tie728_s8_unaligned_store0 q5, a2, a13 - dl_tie728_s8_store_remainder q5, a9, a11, a12, a13, a2, a10 - - dl_tie728_s8_unaligned_rescale_sub2d_output_shift_prelu_end: - retw \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/xtensa/dl_xtensa_s16_block.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/xtensa/dl_xtensa_s16_block.S deleted file mode 100644 index 0bbeee7b..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/base/isa/xtensa/dl_xtensa_s16_block.S +++ /dev/null @@ -1,796 +0,0 @@ -############################################################################################################################################################ -# xtensa general tools -############################################################################################################################################################ -.macro xtensa_clear_accumulator zero - # zero: any a-register equals 0 - - wsr \zero, acchi - wsr \zero, acclo -.endm - - -.macro xtensa_fetch_accumulator output temp - # output: output value - # temp: temporary variable - - rsr \output, acchi - rsr \temp, acclo - - src \output, \output, \temp -.endm - - -.macro xtensa_saturation x - # x: any a-register - - clamps \x, \x, 15 -.endm - - - -.macro xtensa_bias x bias_address temp - # x: a variable to be added - # bias_address: - # temp: a temporary variable keep bias element value - - l16si \temp, \bias_address, 0 - addi \bias_address, \bias_address, 2 - add \x, \x, \temp -.endm - - -.macro xtensa_store output_address x - # output_address: - # x: a variable to be stored - - s16i \x, \output_address, 0 - addi \output_address, \output_address, 2 -.endm - - -.macro xtensa_relu x zero - # x: varaible - # zero: any a-register equals to 0 - - max \x, \x, \zero -.endm - - - -############################################################################################################################################################ -# xtensa_s16_conv2d_11c1 -############################################################################################################################################################ -.macro xtensa_s16_conv2d_11c1 input__v0 input__v1 filter_v0 filter_v1 input__ptr filter_ptr c c_rs2_1 - # input__v0: 2 input elements - # input__v1: 2 input elements - # filter_v0: 2 filter elements - # filter_v1: 2 filter elements - # input__ptr: - # filter_ptr: - # c: input_channel - # c_rs2_1: input_channel >> 2 - 1 - - bgei \c, 4, 5f - beqi \c, 3, 3f - beqi \c, 2, 2f - # c == 1 - ldinc \input__v0, \input__ptr - ldinc \filter_v0, \filter_ptr - mula.dd.ll \input__v0, \filter_v0 - addi \input__ptr, \input__ptr, -2 - addi \filter_ptr, \filter_ptr, -2 - j 1f -2: - # c == 2 - ldinc \input__v0, \input__ptr - ldinc \filter_v0, \filter_ptr - mula.dd.ll \input__v0, \filter_v0 - mula.dd.hh \input__v0, \filter_v0 - j 1f -3: - # c == 3 - ldinc \input__v0, \input__ptr - ldinc \filter_v0, \filter_ptr - ldinc \input__v1, \input__ptr - mula.dd.ll.ldinc \filter_v1, \filter_ptr, \input__v0, \filter_v0 - mula.dd.hh \input__v0, \filter_v0 - mula.dd.ll \input__v1, \filter_v1 - addi \input__ptr, \input__ptr, -2 - addi \filter_ptr, \filter_ptr, -2 - j 1f - -5: - # c >= 4 - ldinc \input__v0, \input__ptr - ldinc \filter_v0, \filter_ptr - ldinc \input__v1, \input__ptr - mula.dd.ll.ldinc \filter_v1, \filter_ptr, \input__v0, \filter_v0 - - loopgtz \c_rs2_1, 4f - mula.dd.hh.ldinc \input__v0, \input__ptr, \input__v0, \filter_v0 - mula.dd.ll.ldinc \filter_v0, \filter_ptr, \input__v1, \filter_v1 - mula.dd.hh.ldinc \input__v1, \input__ptr, \input__v1, \filter_v1 - mula.dd.ll.ldinc \filter_v1, \filter_ptr, \input__v0, \filter_v0 -4: - mula.dd.hh \input__v0, \filter_v0 - mula.dd.ll \input__v1, \filter_v1 - mula.dd.hh \input__v1, \filter_v1 - - bbci \c, 1, 6f - # c % 4 == 2 or 3 - ldinc \input__v0, \input__ptr - ldinc \filter_v0, \filter_ptr - mula.dd.hh \input__v0, \filter_v0 - mula.dd.ll \input__v0, \filter_v0 -6: - bbci \c, 0, 1f - # c % 2 == 1 - ldinc \input__v0, \input__ptr - ldinc \filter_v0, \filter_ptr - mula.dd.ll \input__v0, \filter_v0 - addi \input__ptr, \input__ptr, -2 - addi \filter_ptr, \filter_ptr, -2 -1: -.endm - - - -.macro xtensa_load_args input__ptr args filter_ptr c n c_rs2_1 mac_shift - l32i \c, \args, 4 // input_channel - l32i \n, \args, 36 // output_channel - l32i \filter_ptr, \args, 48 // filter - l32i \mac_shift, \args, 64 // mac_shift - l32i \c_rs2_1, \args, 92 // input_channel >> 2 - 1 - addi \filter_ptr, \filter_ptr, -4 // ldinc will bump up pointer first then load - addi \input__ptr, \input__ptr, -4 // ldinc will bump up pointer first then load - ssr \mac_shift - movi \mac_shift, 0 -.endm - - - - - .align 4 - .text - .global dl_xtensa_s16_conv2d_11cn_bias - .type dl_xtensa_s16_conv2d_11cn_bias, @function - .section .iram1 -dl_xtensa_s16_conv2d_11cn_bias: - .align 4 - entry sp, 16 - - # a2: int16_t *output_address - # a3: int16_t *input__ptr - # a4: void *args - - # a5: filter_ptr - # a6: c - # a7: n - # a8: c_rs2_1 - # a9: mac_shift (after set srr a9 will be 0) - xtensa_load_args a3, a4, a5, a6, a7, a8, a9 - # a10: bias_address - # a11: - # a12: - # a13: output variable - # a14: temporary variable - # a15: moving_input_address - l32i a10, a4, 68 # bias_address -xtensa_s16_conv2d_11cn_bias_loop: - mov a15, a3 # reload input__ptr - - xtensa_clear_accumulator a9 - - xtensa_s16_conv2d_11c1 m0, m1, m2, m3, a15, a5, a6, a8 - - xtensa_fetch_accumulator a13, a14 - xtensa_bias a13, a10, a14 - # xtensa_relu a13, a9 - xtensa_saturation a13 - xtensa_store a2, a13 - - addi a7, a7, -1 # bump up pointer - bnez a7, xtensa_s16_conv2d_11cn_bias_loop - retw - - - - - .align 4 - .text - .global dl_xtensa_s16_conv2d_11cn_bias_relu - .type dl_xtensa_s16_conv2d_11cn_bias_relu, @function - .section .iram1 -dl_xtensa_s16_conv2d_11cn_bias_relu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_address - # a3: int16_t *input__ptr - # a4: void *args - - # a5: filter_ptr - # a6: c - # a7: n - # a8: c_rs2_1 - # a9: mac_shift (after set srr a9 will be 0) - xtensa_load_args a3, a4, a5, a6, a7, a8, a9 - # a10: bias_address - # a11: - # a12: - # a13: output variable - # a14: temporary variable - # a15: moving_input_address - l32i a10, a4, 68 # bias_address -xtensa_s16_conv2d_11cn_bias_relu_loop: - mov a15, a3 # reload input__ptr - - xtensa_clear_accumulator a9 - - xtensa_s16_conv2d_11c1 m0, m1, m2, m3, a15, a5, a6, a8 - - xtensa_fetch_accumulator a13, a14 - xtensa_bias a13, a10, a14 - xtensa_relu a13, a9 - xtensa_saturation a13 - xtensa_store a2, a13 - - addi a7, a7, -1 # bump up pointer - bnez a7, xtensa_s16_conv2d_11cn_bias_relu_loop - retw - - - - - .align 4 - .text - .global dl_xtensa_s16_conv2d_11cn - .type dl_xtensa_s16_conv2d_11cn, @function - .section .iram1 -dl_xtensa_s16_conv2d_11cn: - .align 4 - entry sp, 16 - - # a2: int16_t *output_address - # a3: int16_t *input__ptr - # a4: void *args - - # a5: filter_ptr - # a6: c - # a7: n - # a8: c_rs2_1 - # a9: mac_shift (after set srr a9 will be 0) - xtensa_load_args a3, a4, a5, a6, a7, a8, a9 - # a10: bias_address - # a11: - # a12: - # a13: output variable - # a14: temporary variable - # a15: moving_input_address - # l32i a10, a4, 68 # bias_address -xtensa_s16_conv2d_11cn_loop: - mov a15, a3 # reload input__ptr - - xtensa_clear_accumulator a9 - - xtensa_s16_conv2d_11c1 m0, m1, m2, m3, a15, a5, a6, a8 - - xtensa_fetch_accumulator a13, a14 - # xtensa_bias a13, a10, a14 - # xtensa_relu a13, a9 - xtensa_saturation a13 - xtensa_store a2, a13 - - addi a7, a7, -1 # bump up pointer - bnez a7, xtensa_s16_conv2d_11cn_loop - retw - - - - - .align 4 - .text - .global dl_xtensa_s16_conv2d_11cn_relu - .type dl_xtensa_s16_conv2d_11cn_relu, @function - .section .iram1 -dl_xtensa_s16_conv2d_11cn_relu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_address - # a3: int16_t *input__ptr - # a4: void *args - - # a5: filter_ptr - # a6: c - # a7: n - # a8: c_rs2_1 - # a9: mac_shift (after set srr a9 will be 0) - xtensa_load_args a3, a4, a5, a6, a7, a8, a9 - # a10: bias_address - # a11: - # a12: - # a13: output variable - # a14: temporary variable - # a15: moving_input_address - # l32i a10, a4, 68 # bias_address -xtensa_s16_conv2d_11cn_relu_loop: - mov a15, a3 # reload input__ptr - - xtensa_clear_accumulator a9 - - xtensa_s16_conv2d_11c1 m0, m1, m2, m3, a15, a5, a6, a8 - - xtensa_fetch_accumulator a13, a14 - # xtensa_bias a13, a10, a14 - xtensa_relu a13, a9 - xtensa_saturation a13 - xtensa_store a2, a13 - - addi a7, a7, -1 # bump up pointer - bnez a7, xtensa_s16_conv2d_11cn_relu_loop - retw - - - -############################################################################################################################################################ -# xtensa_s16_conv2d_33c1 -############################################################################################################################################################ -.macro xtensa_s16_conv2d_33c1 input__v0 input__v1 filter_v0 filter_v1 input__ptr filter_ptr c c_rs2_1 dilation_x_offset dilation_y_offset - # input__v0: 2 input elements - # input__v1: 2 input elements - # filter_v0: 2 filter elements - # filter_v1: 2 filter elements - # input__ptr: - # filter_ptr: - # c: input_channel - # c_rs2_1: input_channel >> 2 - 1 - # dilation_x_offset: (dilation_x * input_channel_with_padding - input_channel) * sizeof(output_t) - # dilation_y_offset: (dilation_y * input_width_with_padding * input_channel_with_padding - input_channel - dilation_x * input_channel_with_padding * (filter_width - 1)) * sizeof(output_t) - - xtensa_s16_conv2d_11c1 \input__v0 \input__v1 \filter_v0 \filter_v1 \input__ptr \filter_ptr \c \c_rs2_1 - add \input__ptr, \input__ptr, \dilation_x_offset # go to the next input__ptr - - xtensa_s16_conv2d_11c1 \input__v0 \input__v1 \filter_v0 \filter_v1 \input__ptr \filter_ptr \c \c_rs2_1 - add \input__ptr, \input__ptr, \dilation_x_offset # go to the next input__ptr - - xtensa_s16_conv2d_11c1 \input__v0 \input__v1 \filter_v0 \filter_v1 \input__ptr \filter_ptr \c \c_rs2_1 - add \input__ptr, \input__ptr, \dilation_y_offset # go to the next input__ptr - - xtensa_s16_conv2d_11c1 \input__v0 \input__v1 \filter_v0 \filter_v1 \input__ptr \filter_ptr \c \c_rs2_1 - add \input__ptr, \input__ptr, \dilation_x_offset # go to the next input__ptr - - xtensa_s16_conv2d_11c1 \input__v0 \input__v1 \filter_v0 \filter_v1 \input__ptr \filter_ptr \c \c_rs2_1 - add \input__ptr, \input__ptr, \dilation_x_offset # go to the next input__ptr - - xtensa_s16_conv2d_11c1 \input__v0 \input__v1 \filter_v0 \filter_v1 \input__ptr \filter_ptr \c \c_rs2_1 - add \input__ptr, \input__ptr, \dilation_y_offset # go to the next input__ptr - - xtensa_s16_conv2d_11c1 \input__v0 \input__v1 \filter_v0 \filter_v1 \input__ptr \filter_ptr \c \c_rs2_1 - add \input__ptr, \input__ptr, \dilation_x_offset # go to the next input__ptr - - xtensa_s16_conv2d_11c1 \input__v0 \input__v1 \filter_v0 \filter_v1 \input__ptr \filter_ptr \c \c_rs2_1 - add \input__ptr, \input__ptr, \dilation_x_offset # go to the next input_ptr - - xtensa_s16_conv2d_11c1 \input__v0 \input__v1 \filter_v0 \filter_v1 \input__ptr \filter_ptr \c \c_rs2_1 -.endm - - - -.macro xtensa_load_hwcn_args input__ptr args filter_ptr c n c_rs2_1 mac_shift dilation_x_offset dilation_y_offset - xtensa_load_args \input__ptr, \args, \filter_ptr, \c, \n, \c_rs2_1, \mac_shift - l32i \dilation_x_offset, \args, 108 // input dilation x offset - l32i \dilation_y_offset, \args, 112 // input dilation y offset -.endm - - - - .align 4 - .text - .global dl_xtensa_s16_conv2d_33cn_bias - .type dl_xtensa_s16_conv2d_33cn_bias, @function - .section .iram1 -dl_xtensa_s16_conv2d_33cn_bias: - .align 4 - entry sp, 16 - - # a2: int16_t *output_address - # a3: int16_t *input__ptr - # a4: void *args - - # a5: filter_ptr - # a6: c - # a7: n - # a8: c_rs2_1 - # a9: mac_shift (after set srr a9 will be 0) - # a10: bias_address - # a11: input dilation x offset - # a12: input dilation y offset - # a13: output variable - # a14: temporary variable - # a15: moving_input_address - xtensa_load_hwcn_args a3, a4, a5, a6, a7, a8, a9, a11, a12 - - l32i a10, a4, 68 # bias_address -xtensa_s16_conv2d_33cn_bias_loop: - mov a15, a3 # reload input__ptr - - xtensa_clear_accumulator a9 - - xtensa_s16_conv2d_33c1 m0, m1, m2, m3, a15, a5, a6, a8, a11, a12 - - xtensa_fetch_accumulator a13, a14 - xtensa_bias a13, a10, a14 - # xtensa_relu a13, a9 - xtensa_saturation a13 - xtensa_store a2, a13 - - addi a7, a7, -1 # bump up pointer - bnez a7, xtensa_s16_conv2d_33cn_bias_loop - retw - - - - .align 4 - .text - .global dl_xtensa_s16_conv2d_33cn_bias_relu - .type dl_xtensa_s16_conv2d_33cn_bias_relu, @function - .section .iram1 -dl_xtensa_s16_conv2d_33cn_bias_relu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_address - # a3: int16_t *input__ptr - # a4: void *args - - # a5: filter_ptr - # a6: c - # a7: n - # a8: c_rs2_1 - # a9: mac_shift (after set srr a9 will be 0) - # a10: bias_address - # a11: input dilation x offset - # a12: input dilation y offset - # a13: output variable - # a14: temporary variable - # a15: moving_input_address - xtensa_load_hwcn_args a3, a4, a5, a6, a7, a8, a9, a11, a12 - - l32i a10, a4, 68 # bias_address -xtensa_s16_conv2d_33cn_bias_relu_loop: - mov a15, a3 # reload input__ptr - - xtensa_clear_accumulator a9 - - xtensa_s16_conv2d_33c1 m0, m1, m2, m3, a15, a5, a6, a8, a11, a12 - - xtensa_fetch_accumulator a13, a14 - xtensa_bias a13, a10, a14 - xtensa_relu a13, a9 - xtensa_saturation a13 - xtensa_store a2, a13 - - addi a7, a7, -1 # bump up pointer - bnez a7, xtensa_s16_conv2d_33cn_bias_relu_loop - retw - - - - .align 4 - .text - .global dl_xtensa_s16_conv2d_33cn - .type dl_xtensa_s16_conv2d_33cn, @function - .section .iram1 -dl_xtensa_s16_conv2d_33cn: - .align 4 - entry sp, 16 - - # a2: int16_t *output_address - # a3: int16_t *input__ptr - # a4: void *args - - # a5: filter_ptr - # a6: c - # a7: n - # a8: c_rs2_1 - # a9: mac_shift (after set srr a9 will be 0) - # a10: bias_address - # a11: input dilation x offset - # a12: input dilation y offset - # a13: output variable - # a14: temporary variable - # a15: moving_input_address - xtensa_load_hwcn_args a3, a4, a5, a6, a7, a8, a9, a11, a12 - - # l32i a10, a4, 68 # bias_address -xtensa_s16_conv2d_33cn_loop: - mov a15, a3 # reload input__ptr - - xtensa_clear_accumulator a9 - - xtensa_s16_conv2d_33c1 m0, m1, m2, m3, a15, a5, a6, a8, a11, a12 - - xtensa_fetch_accumulator a13, a14 - # xtensa_bias a13, a10, a14 - # xtensa_relu a13, a9 - xtensa_saturation a13 - xtensa_store a2, a13 - - addi a7, a7, -1 # bump up pointer - bnez a7, xtensa_s16_conv2d_33cn_loop - retw - - - - .align 4 - .text - .global dl_xtensa_s16_conv2d_33cn_relu - .type dl_xtensa_s16_conv2d_33cn_relu, @function - .section .iram1 -dl_xtensa_s16_conv2d_33cn_relu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_address - # a3: int16_t *input__ptr - # a4: void *args - - # a5: filter_ptr - # a6: c - # a7: n - # a8: c_rs2_1 - # a9: mac_shift (after set srr a9 will be 0) - # a10: - # a11: input dilation x offset - # a12: input dilation y offset - # a13: output variable - # a14: temporary variable - # a15: moving_input_address - xtensa_load_hwcn_args a3, a4, a5, a6, a7, a8, a9, a11, a12 - - # l32i a10, a4, 68 # bias_address -xtensa_s16_conv2d_33cn_relu_loop: - mov a15, a3 # reload input__ptr - - xtensa_clear_accumulator a9 - - xtensa_s16_conv2d_33c1 m0, m1, m2, m3, a15, a5, a6, a8, a11, a12 - - xtensa_fetch_accumulator a13, a14 - # xtensa_bias a13, a10, a14 - xtensa_relu a13, a9 - xtensa_saturation a13 - xtensa_store a2, a13 - - addi a7, a7, -1 # bump up pointer - bnez a7, xtensa_s16_conv2d_33cn_relu_loop - retw - - - - -############################################################################################################################################################ -# xtensa_s16_conv2d_hwc1 -############################################################################################################################################################ -.macro xtensa_s16_conv2d_hwc1 input__v0 input__v1 filter_v0 filter_v1 input__ptr filter_ptr c c_rs2_1 dilation_x_offset dilation_y_offset filter_h filter_w args - # input__v0: 2 input elements - # input__v1: 2 input elements - # filter_v0: 2 filter elements - # filter_v1: 2 filter elements - # input__ptr: - # filter_ptr: - # c: input_channel - # c_rs2_1: input_channel >> 2 - 1 - # dilation_x_offset: dilation_x * input_channel_with_padding - input_channel - # dilation_y_offset: dilation_y * input_width_with_padding * input_channel_with_padding - input_channel - dilation_x * input_channel_with_padding * (filter_width - 1) - # filter_h: filter height - # filter_w: filter width - - l32i \filter_h, \args, 52 # filter_height - 7: - l32i \filter_w, \args, 56 # filter_width - beqi \filter_w, 1, 9f - 8: - xtensa_s16_conv2d_11c1 \input__v0 \input__v1 \filter_v0 \filter_v1 \input__ptr \filter_ptr \c \c_rs2_1 - add \input__ptr, \input__ptr, \dilation_x_offset - - addi \filter_w, \filter_w, -1 - bgei \filter_w, 2, 8b - 9: - xtensa_s16_conv2d_11c1 \input__v0 \input__v1 \filter_v0 \filter_v1 \input__ptr \filter_ptr \c \c_rs2_1 - l32i \filter_w, \args, 60 # filter_y_offset - add \input__ptr, \input__ptr, \dilation_y_offset - add \filter_ptr, \filter_ptr, \filter_w - - addi \filter_h, \filter_h, -1 - bnez \filter_h, 7b - - l32i \filter_h, \args, 144 # filter_n_offset - add \filter_ptr, \filter_ptr, \filter_h -.endm - - - - - .align 4 - .text - .global dl_xtensa_s16_conv2d_hwcn_bias - .type dl_xtensa_s16_conv2d_hwcn_bias, @function - .section .iram1 -dl_xtensa_s16_conv2d_hwcn_bias: - .align 4 - entry sp, 16 - - # a2: int16_t *output_address - # a3: int16_t *input__ptr - # a4: void *args - - # a5: filter_ptr - # a6: c - # a7: n - # a8: c_rs2_1 - # a9: mac_shift (after set srr a9 will be 0) - # a10: bias_address - # a11: input dilation x offset - # a12: input dilation y offset - # a13: filter_height, output variable - # a14: filter_width, temporary variable - # a15: moving_input_address - xtensa_load_hwcn_args a3, a4, a5, a6, a7, a8, a9, a11, a12 - - l32i a10, a4, 68 # bias_address -xtensa_s16_conv2d_hwcn_bias_loop: - mov a15, a3 # reload input__ptr - - xtensa_clear_accumulator a9 - - xtensa_s16_conv2d_hwc1 m0, m1, m2, m3, a15, a5, a6, a8, a11, a12, a13, a14, a4 - - xtensa_fetch_accumulator a13, a14 - xtensa_bias a13, a10, a14 - # xtensa_relu a13, a9 - xtensa_saturation a13 - xtensa_store a2, a13 - - addi a7, a7, -1 # bump up pointer - bnez a7, xtensa_s16_conv2d_hwcn_bias_loop - retw - - - - .align 4 - .text - .global dl_xtensa_s16_conv2d_hwcn_bias_relu - .type dl_xtensa_s16_conv2d_hwcn_bias_relu, @function - .section .iram1 -dl_xtensa_s16_conv2d_hwcn_bias_relu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_address - # a3: int16_t *input__ptr - # a4: void *args - - # a5: filter_ptr - # a6: c - # a7: n - # a8: c_rs2_1 - # a9: mac_shift (after set srr a9 will be 0) - # a10: bias_address - # a11: input dilation x offset - # a12: input dilation y offset - # a13: filter_height, output variable - # a14: filter_width, temporary variable - # a15: moving_input_address - xtensa_load_hwcn_args a3, a4, a5, a6, a7, a8, a9, a11, a12 - - l32i a10, a4, 68 # bias_address -xtensa_s16_conv2d_hwcn_bias_relu_loop: - mov a15, a3 # reload input__ptr - - xtensa_clear_accumulator a9 - - xtensa_s16_conv2d_hwc1 m0, m1, m2, m3, a15, a5, a6, a8, a11, a12, a13, a14, a4 - - xtensa_fetch_accumulator a13, a14 - xtensa_bias a13, a10, a14 - xtensa_relu a13, a9 - xtensa_saturation a13 - xtensa_store a2, a13 - - addi a7, a7, -1 # bump up pointer - bnez a7, xtensa_s16_conv2d_hwcn_bias_relu_loop - retw - - - - .align 4 - .text - .global dl_xtensa_s16_conv2d_hwcn - .type dl_xtensa_s16_conv2d_hwcn, @function - .section .iram1 -dl_xtensa_s16_conv2d_hwcn: - .align 4 - entry sp, 16 - - # a2: int16_t *output_address - # a3: int16_t *input__ptr - # a4: void *args - - # a5: filter_ptr - # a6: c - # a7: n - # a8: c_rs2_1 - # a9: mac_shift (after set srr a9 will be 0) - # a10: - # a11: input dilation x offset - # a12: input dilation y offset - # a13: filter_height, output variable - # a14: filter_width, temporary variable - # a15: moving_input_address - xtensa_load_hwcn_args a3, a4, a5, a6, a7, a8, a9, a11, a12 - - # l32i a10, a4, 68 # bias_address -xtensa_s16_conv2d_hwcn_loop: - mov a15, a3 # reload input__ptr - - xtensa_clear_accumulator a9 - - xtensa_s16_conv2d_hwc1 m0, m1, m2, m3, a15, a5, a6, a8, a11, a12, a13, a14, a4 - - xtensa_fetch_accumulator a13, a14 - # xtensa_bias a13, a10, a14 - # xtensa_relu a13, a9 - xtensa_saturation a13 - xtensa_store a2, a13 - - addi a7, a7, -1 # bump up pointer - bnez a7, xtensa_s16_conv2d_hwcn_loop - retw - - - - .align 4 - .text - .global dl_xtensa_s16_conv2d_hwcn_relu - .type dl_xtensa_s16_conv2d_hwcn_relu, @function - .section .iram1 -dl_xtensa_s16_conv2d_hwcn_relu: - .align 4 - entry sp, 16 - - # a2: int16_t *output_address - # a3: int16_t *input__ptr - # a4: void *args - - # a5: filter_ptr - # a6: c - # a7: n - # a8: c_rs2_1 - # a9: mac_shift (after set srr a9 will be 0) - # a10: bias_address - # a11: input dilation x offset - # a12: input dilation y offset - # a13: filter_height, output variable - # a14: filter_width, temporary variable - # a15: moving_input_address - xtensa_load_hwcn_args a3, a4, a5, a6, a7, a8, a9, a11, a12 - - # l32i a10, a4, 68 # bias_address -xtensa_s16_conv2d_hwcn_relu_loop: - mov a15, a3 # reload input__ptr - - xtensa_clear_accumulator a9 - - xtensa_s16_conv2d_hwc1 m0, m1, m2, m3, a15, a5, a6, a8, a11, a12, a13, a14, a4 - - xtensa_fetch_accumulator a13, a14 - # xtensa_bias a13, a10, a14 - xtensa_relu a13, a9 - xtensa_saturation a13 - xtensa_store a2, a13 - - addi a7, a7, -1 # bump up pointer - bnez a7, xtensa_s16_conv2d_hwcn_relu_loop - retw \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/dl_define.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/dl_define.hpp deleted file mode 100644 index f221ecb5..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/dl_define.hpp +++ /dev/null @@ -1,130 +0,0 @@ -#pragma once - -#include "dl_define_private.hpp" -#include "sdkconfig.h" -#include -#include - -#define DL_LOG_LATENCY_UNIT 0 /* (high)) ? (high) : (x)) -#endif - -#ifndef DL_ABS -#define DL_ABS(x) ((x) < 0 ? (-(x)) : (x)) -#endif - -#ifndef DL_RIGHT_SHIFT -#define DL_RIGHT_SHIFT(x, shift) (((shift) > 0) ? ((x) >> (shift)) : ((x) << -(shift))) -#endif - -#ifndef DL_LEFT_SHIFT -#define DL_LEFT_SHIFT(x, shift) (((shift) > 0) ? ((x) << (shift)) : ((x) >> -(shift))) -#endif - -#ifndef DL_SCALE -#define DL_SCALE(exponent) (((exponent) > 0) ? (1 << (exponent)) : ((float)1.0 / (1 << -(exponent)))) -#endif - -#ifndef DL_RESCALE -#define DL_RESCALE(exponent) (((exponent) > 0) ? ((float)1.0 / (1 << (exponent))) : (1 << -(exponent))) -#endif - -#define DL_QUANT8_MAX 127 -#define DL_QUANT8_MIN -128 -#define DL_QUANT16_MAX 32767 -#define DL_QUANT16_MIN -32768 -#define DL_QUANT32_MAX 2147483647 -#define DL_QUANT32_MIN -2147483648 - -#define QIQO 0 -#define QIFO 1 - -namespace dl { -typedef enum { - QUANT_TYPE_NONE, /*Unknown quantization type*/ - QUANT_TYPE_FLOAT32, /**/ - QUANT_TYPE_SYMM_8BIT, /**/ - QUANT_TYPE_SYMM_16BIT, /**/ - QUANT_TYPE_SYMM_32BIT, /**/ -} quant_type_t; - -typedef enum { - Linear, /**/ - ReLU, /**/ - LeakyReLU, /**/ - PReLU, /**/ - // TODO: ReLU6 -} activation_type_t; - -typedef enum { - PADDING_NOT_SET, - PADDING_VALID, /**/ - PADDING_SAME_BEGIN, /**/ - PADDING_SAME_END, /**/ -} padding_type_t; - -typedef enum { - PADDING_EMPTY, - PADDING_CONSTANT, - PADDING_EDGE, - PADDING_REFLECT, - PADDING_SYMMETRIC, -} padding_mode_t; - -typedef enum { RESIZE_NEAREST, RESIZE_LINEAR, RESIZE_CUBIC } resize_mode_t; - -/** - * @brief The mode of esp-dl runtime, single-core or multi-core - */ -typedef enum { - RUNTIME_MODE_AUTO = 0, // Automatically select single-core or multi-core runtime - RUNTIME_MODE_SINGLE_CORE = 1, // Always select single-core runtime - RUNTIME_MODE_MULTI_CORE = 2, // Always select multi-core runtime(dual core for ESP32-S3 and ESP32-P4) -} runtime_mode_t; -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/dl_define_private.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/dl_define_private.hpp deleted file mode 100644 index 49e8d9c6..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/dl_define_private.hpp +++ /dev/null @@ -1,39 +0,0 @@ -/** - * @file dl_define_private.hpp - * @brief All macro here is for internal only. Once the project is compiled to static library, these macro is not - * effective. - * @version 0.1 - * @date 2021-07-02 - * - * @copyright Copyright (c) 2021 - * - */ - -#pragma once -#include "sdkconfig.h" - -#define DL_S16_BUFFER_TYPE \ - int64_t /**/ -#define DL_LOG_DETECT_LATENCY 0 /* -namespace dl { -namespace lut { -const int8_t sigmoid_exp_neg13_lut[256] = { - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}; -const int8_t sigmoid_exp_neg12_lut[256] = { - 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, - 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, - 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65}; -const int8_t sigmoid_exp_neg11_lut[256] = { - 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, - 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, - 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, - 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, - 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66}; -const int8_t sigmoid_exp_neg10_lut[256] = { - 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, - 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, - 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, - 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, - 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68}; -const int8_t sigmoid_exp_neg9_lut[256] = { - 56, 56, 56, 56, 56, 56, 56, 56, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 58, 58, 58, 58, 58, - 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, - 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, - 61, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, - 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 69, 69, - 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, - 70, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 72, 72, 72, 72, 72, 72, 72}; -const int8_t sigmoid_exp_neg8_lut[256] = { - 48, 48, 49, 49, 49, 49, 49, 49, 49, 49, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, - 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 54, 54, 55, 55, 55, 55, 55, 55, - 55, 55, 56, 56, 56, 56, 56, 56, 56, 56, 57, 57, 57, 57, 57, 57, 57, 57, 58, 58, 58, 58, 58, 58, 58, 58, 59, 59, 59, - 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 60, 61, 61, 61, 61, 61, 61, 61, 61, 62, 62, 62, 62, 62, 62, 62, 62, - 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, - 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 70, - 70, 70, 70, 70, 70, 70, 70, 71, 71, 71, 71, 71, 71, 71, 71, 72, 72, 72, 72, 72, 72, 72, 72, 73, 73, 73, 73, 73, 73, - 73, 73, 74, 74, 74, 74, 74, 74, 74, 74, 75, 75, 75, 75, 75, 75, 75, 75, 75, 76, 76, 76, 76, 76, 76, 76, 76, 77, 77, - 77, 77, 77, 77, 77, 77, 78, 78, 78, 78, 78, 78, 78, 78, 78, 79, 79, 79, 79, 79, 79, 79, 79, 80}; -const int8_t sigmoid_exp_neg7_lut[256] = { - 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 39, 39, 39, 40, 40, 40, 40, - 40, 41, 41, 41, 41, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43, 44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 46, 47, - 47, 47, 47, 48, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 51, 51, 51, 52, 52, 52, 52, 53, 53, 53, 53, 54, - 54, 54, 54, 55, 55, 55, 55, 56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59, 59, 60, 60, 60, 60, 61, 61, - 61, 61, 62, 62, 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 68, 68, - 68, 68, 69, 69, 69, 69, 70, 70, 70, 70, 71, 71, 71, 71, 72, 72, 72, 72, 73, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75, - 75, 76, 76, 76, 76, 77, 77, 77, 77, 78, 78, 78, 78, 78, 79, 79, 79, 79, 80, 80, 80, 80, 81, 81, 81, 81, 82, 82, 82, - 82, 82, 83, 83, 83, 83, 84, 84, 84, 84, 85, 85, 85, 85, 85, 86, 86, 86, 86, 86, 87, 87, 87, 87, 88, 88, 88, 88, 88, - 89, 89, 89, 89, 90, 90, 90, 90, 90, 91, 91, 91, 91, 91, 92, 92, 92, 92, 92, 93, 93, 93, 93, 93}; -const int8_t sigmoid_exp_neg6_lut[256] = { - 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, - 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 26, 26, 27, - 27, 27, 28, 28, 29, 29, 29, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 34, 34, 34, 35, - 35, 36, 36, 36, 37, 37, 38, 38, 38, 39, 39, 40, 40, 41, 41, 42, 42, 42, 43, 43, 44, 44, - 45, 45, 46, 46, 46, 47, 47, 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, - 55, 56, 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64, 64, 64, 65, 65, - 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72, 72, 73, 73, 74, 74, 75, 75, 76, 76, - 77, 77, 78, 78, 79, 79, 80, 80, 81, 81, 82, 82, 82, 83, 83, 84, 84, 85, 85, 86, 86, 86, - 87, 87, 88, 88, 89, 89, 90, 90, 90, 91, 91, 92, 92, 92, 93, 93, 94, 94, 94, 95, 95, 96, - 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100, 100, 101, 101, 101, 102, 102, 102, 102, 103, 103, - 103, 104, 104, 104, 105, 105, 105, 106, 106, 106, 106, 107, 107, 107, 107, 108, 108, 108, 109, 109, 109, 109, - 110, 110, 110, 110, 111, 111, 111, 111, 111, 112, 112, 112, 112, 113}; -const int8_t sigmoid_exp_neg5_lut[256] = { - 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, - 9, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 16, - 16, 17, 17, 17, 18, 18, 19, 19, 20, 21, 21, 22, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, - 29, 29, 30, 31, 31, 32, 33, 34, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 44, 45, 46, - 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, - 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 82, 83, 84, 85, 86, 87, 88, - 89, 90, 90, 91, 92, 93, 94, 94, 95, 96, 97, 97, 98, 99, 99, 100, 101, 102, 102, 103, 103, 104, - 105, 105, 106, 106, 107, 107, 108, 109, 109, 110, 110, 111, 111, 111, 112, 112, 113, 113, 114, 114, 114, 115, - 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, 118, 119, 119, 119, 119, 120, 120, 120, 120, 121, 121, 121, - 121, 121, 122, 122, 122, 122, 122, 122, 123, 123, 123, 123, 123, 123, 124, 124, 124, 124, 124, 124, 124, 124, - 124, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 126, 126}; -const int8_t sigmoid_exp_neg4_lut[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 9, 9, - 10, 10, 11, 12, 12, 13, 14, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 29, 30, - 31, 33, 34, 36, 38, 39, 41, 43, 45, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, - 72, 74, 76, 78, 80, 82, 83, 85, 87, 89, 90, 92, 94, 95, 97, 98, 99, 101, 102, 103, 105, 106, - 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 116, 117, 118, 118, 119, 119, 120, 120, 121, 121, 122, - 122, 122, 123, 123, 123, 124, 124, 124, 124, 124, 125, 125, 125, 125, 125, 126, 126, 126, 126, 126, 126, 126, - 126, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}; -const int8_t sigmoid_exp_neg3_lut[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 7, 8, 9, 10, 11, - 12, 14, 15, 17, 19, 21, 23, 26, 29, 31, 34, 38, 41, 45, 48, 52, 56, 60, 64, 68, 72, 76, - 80, 83, 87, 90, 94, 97, 99, 102, 105, 107, 109, 111, 113, 114, 116, 117, 118, 119, 120, 121, 122, 123, - 123, 124, 124, 125, 125, 125, 126, 126, 126, 126, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}; -const int8_t sigmoid_exp_neg2_lut[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, - 1, 2, 2, 3, 4, 5, 6, 8, 10, 12, 15, 19, 23, 29, 34, 41, 48, 56, 64, 72, 80, 87, - 94, 99, 105, 109, 113, 116, 118, 120, 122, 123, 124, 125, 126, 126, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}; -const int8_t sigmoid_exp_neg1_lut[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 4, 6, 10, 15, 23, 34, 48, 64, 80, 94, 105, - 113, 118, 122, 124, 126, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}; -const int8_t sigmoid_exp_0_lut[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 15, 34, 64, 94, 113, 122, - 126, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}; -const int8_t sigmoid_exp_1_lut[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 15, 64, 113, 126, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}; -const int8_t sigmoid_exp_2_lut[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 64, 126, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}; -const int8_t sigmoid_exp_3_lut[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, - 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}; -inline const int8_t *get_sigmoid_lut(int exp) -{ - const int8_t *lut_table; - switch (exp) { - case -13: - lut_table = sigmoid_exp_neg13_lut; - break; - case -12: - lut_table = sigmoid_exp_neg12_lut; - break; - case -11: - lut_table = sigmoid_exp_neg11_lut; - break; - case -10: - lut_table = sigmoid_exp_neg10_lut; - break; - case -9: - lut_table = sigmoid_exp_neg9_lut; - break; - case -8: - lut_table = sigmoid_exp_neg8_lut; - break; - case -7: - lut_table = sigmoid_exp_neg7_lut; - break; - case -6: - lut_table = sigmoid_exp_neg6_lut; - break; - case -5: - lut_table = sigmoid_exp_neg5_lut; - break; - case -4: - lut_table = sigmoid_exp_neg4_lut; - break; - case -3: - lut_table = sigmoid_exp_neg3_lut; - break; - case -2: - lut_table = sigmoid_exp_neg2_lut; - break; - case -1: - lut_table = sigmoid_exp_neg1_lut; - break; - case 0: - lut_table = sigmoid_exp_0_lut; - break; - case 1: - lut_table = sigmoid_exp_1_lut; - break; - case 2: - lut_table = sigmoid_exp_2_lut; - break; - case 3: - lut_table = sigmoid_exp_3_lut; - break; - default: - if (exp < -13) - lut_table = sigmoid_exp_neg13_lut; - else - lut_table = sigmoid_exp_3_lut; - } - return lut_table; -} -} // namespace lut -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/math/include/dl_math.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/math/include/dl_math.hpp deleted file mode 100644 index e09c041d..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/math/include/dl_math.hpp +++ /dev/null @@ -1,219 +0,0 @@ -#pragma once - -#include "dl_define.hpp" -#include "dl_tool.hpp" -#include - -namespace dl { -namespace math { -/** - * @brief x^a. - * - * @param x as a base - * @param a as an exponent - * @return x^a - */ -inline float power(float x, int a) -{ - if (a > 0) { - return x * power(x, a - 1); - } else if (a < 0) { - return 1 / (x * power(x, -a - 1)); - } else { - return 1.f; - } -} - -/** - * @brief sqrt(x). - * - * @param x as a base - * @return sqrt(x) - */ -inline float sqrt_quick(float x) -{ - const int result = 0x1fbb4000 + (*(int *)&x >> 1); - return *(float *)&result; -} - -/** - * @brief 1/sqrt(x). - * - * @param x as a base - * @return 1/sqrt(x) - */ -inline float sqrt_reciprocal_quick(float x) -{ - float xhalf = 0.5f * x; - int i = *(int *)&x; // get bits for floating value - i = 0x5f375a86 - (i >> 1); // gives initial guess y0 - x = *(float *)&i; // convert bits back to float - x = x * (1.5f - xhalf * x * x); // Newton step, repeating increases accuracy - return x; -} - -static const float EN = 0.00001f; - -/** - * @brief sqrt(x). - * - * @param x as a base - * @return sqrt(x) - */ -inline float sqrt_newton(float x) -{ - /** - * Use Newton iteration method to find the square root - * */ - if (x == 0.f) - return 0.f; - float result = x; - float last_value; - do { - last_value = result; - result = (last_value + x / last_value) * 0.5; - } while (DL_ABS(result - last_value) > EN); - return result; -} - -/** - * @brief n-th root of x. - * - * @param x as a base - * @param n root times - * @return n-th root of x - */ -inline float root_newton(float x, int n) -{ - if (n == 2) - return sqrt_newton(x); - if (n == 0) - return 1.f; - if (n == 1) - return x; - if (x == 0.f) - return 0.f; - float result = x; - float last_value; - float _n = (float)((n - 1) * n); - do { - last_value = result; - result = _n * last_value + x / (n * power(last_value, n - 1)); - } while (DL_ABS(result - last_value) > EN); - return result; -} - -/** - * @brief atan(x). - * - * @param x as an input - * @return atan(x) in range [-pi/2, pi/2] - */ -inline float atan(float x) -{ - return x * (0.78539816 - (DL_ABS(x) - 1) * (0.2447 + 0.0663 * DL_ABS(x))); - // float s = x*x; - // return ((-0.0464964749 * s + 0.15931422) * s - 0.327622764) * s * x + x; -} - -// TODO:@yuanjiong -/** - * @brief - * - * @param x - * @param y - * @return in range [-pi, pi] - */ -inline float atan2(float x, float y) -{ - float ax = DL_ABS(x); - float ay = DL_ABS(y); - float eps = 1e-8; - float a = DL_MIN(ax, ay) / (DL_MAX(ax, ay) + eps); - float r = atan(a); //[0, pi/2] - if (ay > ax) - r = 1.57079633 - r; - if (x < 0) - r = 3.14159265 - r; - if (y < 0) - r = -r; - - return r; -} - -/** - * @brief acos(x). - * - * @param x as an input - * @return acos(x) in range [-pi/2, pi/2] - */ -inline float acos(float x) -{ - return atan2(x, sqrt_newton(1.0 - x * x)); -} - -/** - * @brief asin(x). - * - * @param x as an input - * @return asin(x) in range [0, pi] - */ -inline float asin(float x) -{ - return atan2(sqrt_newton(1.0 - x * x), x); -} - -/** - * @brief e^x - * - * @param x exponent - * @param steps iteration steps - * @return e^x - */ -inline float exp_fast(float x, int steps = 8) -{ - x = 1.0 + x / (1 << steps); - for (int i = 0; i < steps; i++) x *= x; - return x; -} - -inline float sigmoid(float x) -{ - return 1.0 / (1.0 + expf(-x)); -} - -inline float tanh(float x) -{ - return 2 * sigmoid(2 * x) - 1; -} - -inline float inverse_sigmoid(float x) -{ - return -logf(1.0 / x - 1); -} - -inline void softmax(float *x, int num) -{ - float max_input = x[0]; - for (int i = 1; i < num; i++) max_input = DL_MAX(max_input, x[i]); - - float sum = 0; - for (int i = 0; i < num; i++) { - x[i] = expf(x[i] - max_input); - sum += x[i]; - } - - for (int i = 0; i < num; i++) x[i] /= sum; -} - -inline float dfl_integral(float *data, int reg_max = 7) -{ - softmax(data, reg_max + 1); - float sum = 0; - for (int i = 0; i <= reg_max; i++) { - sum += data[i] * i; - } - return sum; -} -} // namespace math -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/math/include/dl_math_matrix.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/math/include/dl_math_matrix.hpp deleted file mode 100644 index 53c54eb9..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/math/include/dl_math_matrix.hpp +++ /dev/null @@ -1,352 +0,0 @@ -#pragma once - -#include "dl_define.hpp" -#include "dl_tool.hpp" -#include "dl_variable.hpp" -#include "esp_timer.h" -#include -#include -#include -#include - -namespace dl { -namespace math { -/** - * @brief the Matrix class - * - * @tparam T - */ -template -class Matrix { -public: - T **array; - int h; - int w; - Matrix() : h(0), w(0) { this->array = NULL; } - - Matrix(int h, int w) : h(h), w(w) { this->calloc_element(); } - - Matrix(int h, int w, T s) : h(h), w(w) - { - this->calloc_element(); - this->set_value(s); - } - - Matrix(const Matrix &mat) : h(mat.h), w(mat.w) - { - this->calloc_element(); - this->set_value(mat); - } - virtual ~Matrix() - { - if (this->array != NULL) { - for (int i = 0; i < this->h; i++) { - free(this->array[i]); - } - free(this->array); - this->array = NULL; - } - } - - /** - * @brief calloc the matrix element - * - */ - void calloc_element() - { - if ((this->h > 0) && (this->w > 0)) { - this->array = (T **)calloc(this->h, sizeof(T *)); - for (int i = 0; i < this->h; i++) { - this->array[i] = (T *)calloc(this->w, sizeof(T)); - } - } else { - this->array = NULL; - } - } - - /** - * @brief Set the matrix element to random number. - * - * @param thresh the max abs value of the element. - */ - void set_random(T thresh = 1) - { - unsigned int seed = esp_timer_get_time(); - srand(seed); - for (int i = 0; i < this->h; i++) { - for (int j = 0; j < this->w; j++) { - this->array[i][j] = ((T)rand()) / (T)(RAND_MAX)*thresh; - } - } - } - - /** - * @brief Set the small value to zero - * - * @param thresh the threshold of small value - */ - void set_zero(T thresh = 1e-8) - { - for (int i = 0; i < this->h; i++) { - for (int j = 0; j < this->w; j++) { - if (DL_ABS(this->array[i][j]) < thresh) { - this->array[i][j] = 0; - } - } - } - } - - /** - * @brief Set the matrix value from a vector - * - * @tparam TT - * @param mat the input vector - */ - template - void set_value(std::vector mat) - { - int area = this->w * this->h; - assert(area == mat.size()); - int index = 0; - for (int i = 0; i < this->h; i++) { - for (int j = 0; j < this->w; j++) { - this->array[i][j] = (T)(mat[index++]); - } - } - } - - /** - * @brief Set the matrix value from another matrix. - * - * @tparam TT - * @param mat the input matrix. - */ - template - void set_value(const Matrix &mat) - { - assert((this->h == mat.h) && (this->w == mat.w)); - for (int i = 0; i < this->h; i++) { - for (int j = 0; j < this->w; j++) { - this->array[i][j] = (T)(mat.array[i][j]); - } - } - } - - /** - * @brief Set a part of the matrix value from another matrix. - * - * @param h_start the start index of height - * @param h_end the end index of height - * @param w_start the start index of width - * @param w_end the end index of width - * @param mat the input matrix - */ - void set_value(int h_start, int h_end, int w_start, int w_end, const Matrix &mat) - { - int h = h_end - h_start; - int w = w_end - w_start; - - assert((h == mat.h) && (w == mat.w)); - assert((h_end <= this->h) && (w_end <= this->w) && (h_start >= 0) && (w_start >= 0)); - for (int i = 0; i < h; i++) { - for (int j = 0; j < w; j++) { - this->array[i + h_start][j + w_start] = mat.array[i][j]; - } - } - } - - /** - * @brief Set the matrix value to a constant. - * - * @tparam TT - * @param s the input value. - */ - template - void set_value(TT s) - { - for (int i = 0; i < this->h; i++) { - for (int j = 0; j < this->w; j++) { - this->array[i][j] = (T)s; - } - } - } - - /** - * @brief print the matrix element. - * - */ - void print_value() const - { - printf("h: %d, w: %d\n", this->h, this->w); - for (int i = 0; i < this->h; i++) { - for (int j = 0; j < this->w; j++) { - printf("%f ", (float)(this->array[i][j])); - } - printf("\n"); - } - } - - /** - * @brief do matrix multiply - * - * @param input the input matrix - * @return Matrix the output matrix - */ - Matrix matmul(const Matrix &input) const; - - /** - * @brief transpose the matrix - * - * @return Matrix the transposed matrix - */ - Matrix transpose() const; - - /** - * @brief get the inverse matrix - * - * @return Matrix the output matrix - */ - Matrix inverse() const; - - /** - * @brief get the diagonal of the matrix - * - * @return Matrix the diagonal - */ - Matrix diagonal() const; - - /** - * @brief slice the matrix - * - * @param h_start the start index of height - * @param h_end the end index of height - * @param w_start the start index of width - * @param w_end the end index of width - * @return Matrix the output. - */ - Matrix slice(int h_start, int h_end, int w_start, int w_end) const; - - /** - * @brief get an identity matrix - * - * @param n the dim of the identity matrix - * @return Matrix the output - */ - static Matrix identity(int n) - { - Matrix A(n, n); - for (int i = 0; i < n; ++i) { - A.array[i][i] = 1; - } - return A; - } - - /** - * @brief get a diag matrix - * - * @param d the diagonal value. - * @return Matrix the output - */ - static Matrix diag(const Matrix &d) - { - assert(d.h == 1); - Matrix A(d.w, d.w); - for (int i = 0; i < d.w; ++i) { - A.array[i][i] = d.array[0][i]; - } - return A; - } - - static Matrix arange(uint32_t n) - { - Matrix A(1, n); - for (int i = 0; i < n; ++i) { - A.array[0][i] = i; - } - return A; - } - - static Matrix arange(uint32_t n1, uint32_t n2) - { - int len = n2 - n1; - assert(len > 0); - Matrix A(1, len); - for (int i = 0; i < len; ++i) { - A.array[0][i] = n1 + i; - } - - return A; - } - - /** - * @brief get the F_norm of the matrix - * - * @return T the output F_norm - */ - T F_norm() const - { - T f_n = 0.0; - for (int i = 0; i < this->h; ++i) { - for (int j = 0; j < this->w; ++j) { - f_n += (this->array[i][j] * this->array[i][j]); - } - } - f_n = sqrt_newton(f_n); - return f_n; - } - - Matrix &operator=(const Matrix &A) - { - if ((A.h == this->h) && (A.w == this->w)) { - for (int i = 0; i < A.h; ++i) { - for (int j = 0; j < A.w; ++j) { - this->array[i][j] = A.array[i][j]; - } - } - } else { - if (this->array != NULL) { - for (int i = 0; i < this->h; ++i) { - free(this->array[i]); - } - free(this->array); - this->array = NULL; - } - this->h = A.h; - this->w = A.w; - if ((A.h > 0) && (A.w > 0)) { - this->calloc_element(); - this->set_value(A); - } - } - return *this; - } -}; - -/** - * @brief Get the affine transform matrix - * - * @param source_coord the source coordinates - * @param dest_coord the target coordinates - * @return Matrix the output matrix - */ -Matrix get_affine_transform(Matrix &source_coord, Matrix &dest_coord); - -/** - * @brief Get the similarity transform matrix - * - * @param source_coord the source coordinates - * @param dest_coord the target coordinates - * @return Matrix the output matrix - */ -Matrix get_similarity_transform(Matrix &source_coord, Matrix &dest_coord); - -/** - * @brief Get the perspective transform matrix - * - * @param source_coord the source coordinates - * @param dest_coord the target coordinates - * @return Matrix the output matrix - */ -Matrix get_perspective_transform(Matrix &source_coord, Matrix &dest_coord); -} // namespace math -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/math/src/dl_math.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/math/src/dl_math.cpp deleted file mode 100644 index 72633cae..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/math/src/dl_math.cpp +++ /dev/null @@ -1,6 +0,0 @@ -#include "dl_math.hpp" - -namespace dl { -namespace math { -} -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/math/src/dl_math_matrix.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/math/src/dl_math_matrix.cpp deleted file mode 100644 index ddefcebd..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/math/src/dl_math_matrix.cpp +++ /dev/null @@ -1,419 +0,0 @@ -#include "dl_math_matrix.hpp" -#include "dl_math.hpp" - -namespace dl { -namespace math { -template -Matrix Matrix::matmul(const Matrix &input) const -{ - assert(input.h == this->w); - Matrix output(this->h, input.w); - for (int i = 0; i < output.h; i++) { - for (int j = 0; j < output.w; j++) { - for (int k = 0; k < this->w; k++) { - output.array[i][j] += this->array[i][k] * input.array[k][j]; - } - } - } - return output; -} -template Matrix Matrix::matmul(const Matrix &input) const; -template Matrix Matrix::matmul(const Matrix &input) const; -template Matrix Matrix::matmul(const Matrix &input) const; -template Matrix Matrix::matmul(const Matrix &input) const; -template Matrix Matrix::matmul(const Matrix &input) const; -template Matrix Matrix::matmul(const Matrix &input) const; -template Matrix Matrix::matmul(const Matrix &input) const; - -template -Matrix Matrix::transpose() const -{ - Matrix trans(this->w, this->h); - for (int i = 0; i < this->h; i++) { - for (int j = 0; j < this->w; j++) { - trans.array[j][i] = this->array[i][j]; - } - } - return trans; -} -template Matrix Matrix::transpose() const; -template Matrix Matrix::transpose() const; -template Matrix Matrix::transpose() const; -template Matrix Matrix::transpose() const; -template Matrix Matrix::transpose() const; -template Matrix Matrix::transpose() const; -template Matrix Matrix::transpose() const; - -template -Matrix Matrix::inverse() const -{ - Matrix inv(this->h, this->w); - if (this->w != this->h) { - printf("this matrix is not a square matrix !\n"); - return inv; - } - - Matrix matw(this->h, 2 * (this->w)); - T **w = matw.array; - float eps = 1e-6; - - for (int i = 0; i < matw.h; i++) { - for (int j = 0; j < this->w; j++) { - w[i][j] = this->array[i][j]; - } - w[i][(this->w) + i] = 1; - } - - for (int i = 0; i < matw.h; i++) { - if (DL_ABS(w[i][i]) < eps) { - int j; - for (j = i + 1; j < matw.h; j++) { - if (DL_ABS(w[j][i]) > eps) - break; - } - if (j == matw.h) { - printf("This matrix is irreversible!\n"); - return inv; - } - for (int k = i; k < matw.w; k++) { - w[i][k] += w[j][k]; - } - } - float factor = w[i][i]; - for (int k = i; k < matw.w; k++) { - w[i][k] /= factor; - } - for (int k = i + 1; k < matw.h; k++) { - factor = -w[k][i]; - for (int l = i; l < matw.w; l++) { - w[k][l] += (factor * w[i][l]); - } - } - } - for (int i = (matw.h) - 1; i > 0; i--) { - for (int j = i - 1; j >= 0; j--) { - float factor = -w[j][i]; - for (int k = i; k < matw.w; k++) { - w[j][k] += (factor * w[i][k]); - } - } - } - for (int i = 0; i < this->h; i++) { - for (int j = 0; j < this->w; j++) { - inv.array[i][j] = w[i][(this->h) + j]; - } - } - return inv; -} -template Matrix Matrix::inverse() const; -template Matrix Matrix::inverse() const; -template Matrix Matrix::inverse() const; -template Matrix Matrix::inverse() const; -template Matrix Matrix::inverse() const; -template Matrix Matrix::inverse() const; -template Matrix Matrix::inverse() const; - -template -Matrix Matrix::diagonal() const -{ - int rank = DL_MIN(this->h, this->w); - Matrix diag_m(1, rank); - for (int i = 0; i < rank; ++i) { - diag_m.array[0][i] = this->array[i][i]; - } - return diag_m; -} -template Matrix Matrix::diagonal() const; -template Matrix Matrix::diagonal() const; -template Matrix Matrix::diagonal() const; -template Matrix Matrix::diagonal() const; -template Matrix Matrix::diagonal() const; -template Matrix Matrix::diagonal() const; -template Matrix Matrix::diagonal() const; - -template -Matrix Matrix::slice(int h_start, int h_end, int w_start, int w_end) const -{ - assert((h_end >= h_start) && (w_end >= w_start) && (h_start >= 0) && (w_start >= 0)); - int slice_h = DL_MIN((h_end - h_start), (this->h - h_start)); - int slice_w = DL_MIN((w_end - w_start), (this->w - w_start)); - Matrix slice_m(slice_h, slice_w); - for (int i = 0; i < slice_h; ++i) { - for (int j = 0; j < slice_w; ++j) { - slice_m.array[i][j] = this->array[h_start + i][w_start + j]; - } - } - return slice_m; -} -template Matrix Matrix::slice(int h_start, int h_end, int w_start, int w_end) const; -template Matrix Matrix::slice(int h_start, int h_end, int w_start, int w_end) const; -template Matrix Matrix::slice(int h_start, int h_end, int w_start, int w_end) const; -template Matrix Matrix::slice(int h_start, int h_end, int w_start, int w_end) const; -template Matrix Matrix::slice(int h_start, int h_end, int w_start, int w_end) const; -template Matrix Matrix::slice(int h_start, int h_end, int w_start, int w_end) const; -template Matrix Matrix::slice(int h_start, int h_end, int w_start, int w_end) const; - -Matrix get_affine_transform(Matrix &source_coord, Matrix &dest_coord) -{ - Matrix m(3, 3); - float Ainv[3][3] = {0}; - float Adet = - (source_coord.array[0][0] * source_coord.array[1][1] + source_coord.array[0][1] * source_coord.array[2][0] + - source_coord.array[1][0] * source_coord.array[2][1]) - - (source_coord.array[2][0] * source_coord.array[1][1] + source_coord.array[1][0] * source_coord.array[0][1] + - source_coord.array[0][0] * source_coord.array[2][1]); - if (Adet == 0) { - printf("the src is linearly dependent\n"); - return m; - } - Ainv[0][0] = (source_coord.array[1][1] - source_coord.array[2][1]) / Adet; - Ainv[0][1] = (source_coord.array[2][1] - source_coord.array[0][1]) / Adet; - Ainv[0][2] = (source_coord.array[0][1] - source_coord.array[1][1]) / Adet; - Ainv[1][0] = (source_coord.array[2][0] - source_coord.array[1][0]) / Adet; - Ainv[1][1] = (source_coord.array[0][0] - source_coord.array[2][0]) / Adet; - Ainv[1][2] = (source_coord.array[1][0] - source_coord.array[0][0]) / Adet; - Ainv[2][0] = - (source_coord.array[1][0] * source_coord.array[2][1] - source_coord.array[2][0] * source_coord.array[1][1]) / - Adet; - Ainv[2][1] = - (source_coord.array[2][0] * source_coord.array[0][1] - source_coord.array[0][0] * source_coord.array[2][1]) / - Adet; - Ainv[2][2] = - (source_coord.array[0][0] * source_coord.array[1][1] - source_coord.array[0][1] * source_coord.array[1][0]) / - Adet; - - for (int i = 0; i < 3; i++) { - m.array[0][i] = Ainv[i][0] * dest_coord.array[0][0] + Ainv[i][1] * dest_coord.array[1][0] + - Ainv[i][2] * dest_coord.array[2][0]; - m.array[1][i] = Ainv[i][0] * dest_coord.array[0][1] + Ainv[i][1] * dest_coord.array[1][1] + - Ainv[i][2] * dest_coord.array[2][1]; - } - m.array[2][0] = 0; - m.array[2][1] = 0; - m.array[2][2] = 1; - return m; -} - -Matrix get_similarity_transform(Matrix &source_coord, Matrix &dest_coord) -{ - int num = source_coord.h; - int dim = 2; - double src_mean_x = 0.0; - double src_mean_y = 0.0; - double dst_mean_x = 0.0; - double dst_mean_y = 0.0; - Matrix T(3, 3); - - for (int i = 0; i < num; i++) { - src_mean_x += source_coord.array[i][0]; - src_mean_y += source_coord.array[i][1]; - dst_mean_x += dest_coord.array[i][0]; - dst_mean_y += dest_coord.array[i][1]; - } - src_mean_x /= num; - src_mean_y /= num; - dst_mean_x /= num; - dst_mean_y /= num; - - Matrix src_demean(num, 2); - Matrix dst_demean(num, 2); - for (int i = 0; i < num; i++) { - src_demean.array[i][0] = source_coord.array[i][0] - src_mean_x; - src_demean.array[i][1] = source_coord.array[i][1] - src_mean_y; - dst_demean.array[i][0] = dest_coord.array[i][0] - dst_mean_x; - dst_demean.array[i][1] = dest_coord.array[i][1] - dst_mean_y; - } - double A[2][2] = {0}; - for (int i = 0; i < num; i++) { - A[0][0] += (dst_demean.array[i][0] * src_demean.array[i][0] / num); - A[0][1] += (dst_demean.array[i][0] * src_demean.array[i][1] / num); - A[1][0] += (dst_demean.array[i][1] * src_demean.array[i][0] / num); - A[1][1] += (dst_demean.array[i][1] * src_demean.array[i][1] / num); - } - if ((A[0][0] == 0) && (A[0][1] == 0) && (A[1][0] == 0) && (A[1][1] == 0)) { - return T; - } - - double d[2] = {1, 1}; - if (((A[0][0] * A[1][1]) - A[0][1] * A[1][0]) < 0) { - d[1] = -1; - } - - //======================================================================SVD===================================================================== - double U[2][2] = {0}; - double V[2][2] = {0}; - double S[2] = {0}; - - double divide_temp = 0; - - double AAT[2][2] = {0}; - AAT[0][0] = A[0][0] * A[0][0] + A[0][1] * A[0][1]; - AAT[0][1] = A[0][0] * A[1][0] + A[0][1] * A[1][1]; - AAT[1][0] = A[1][0] * A[0][0] + A[1][1] * A[0][1]; - AAT[1][1] = A[1][0] * A[1][0] + A[1][1] * A[1][1]; - - double l1 = (AAT[0][0] + AAT[1][1] + - sqrt_newton((AAT[0][0] + AAT[1][1]) * (AAT[0][0] + AAT[1][1]) - - 4 * ((AAT[0][0] * AAT[1][1]) - (AAT[0][1] * AAT[1][0])))) / - 2.0; - double l2 = (AAT[0][0] + AAT[1][1] - - sqrt_newton((AAT[0][0] + AAT[1][1]) * (AAT[0][0] + AAT[1][1]) - - 4 * ((AAT[0][0] * AAT[1][1]) - (AAT[0][1] * AAT[1][0])))) / - 2.0; - S[0] = sqrt_newton(l1); - S[1] = sqrt_newton(l2); - - U[0][0] = 1.0; - divide_temp = l1 - AAT[1][1]; - if (divide_temp == 0) { - return T; - } - U[1][0] = AAT[1][0] / divide_temp; - double norm = sqrt_newton((U[0][0] * U[0][0]) + (U[1][0] * U[1][0])); - U[0][0] /= norm; - U[1][0] /= norm; - - U[0][1] = 1.0; - divide_temp = l2 - AAT[1][1]; - if (divide_temp == 0) { - return T; - } - U[1][1] = AAT[1][0] / divide_temp; - norm = sqrt_newton((U[0][1] * U[0][1]) + (U[1][1] * U[1][1])); - U[0][1] /= norm; - U[1][1] /= norm; - - if (U[0][1] * U[1][0] < 0) { - U[0][0] = -U[0][0]; - U[1][0] = -U[1][0]; - } - - double ATA[2][2] = {0}; - ATA[0][0] = A[0][0] * A[0][0] + A[1][0] * A[1][0]; - ATA[0][1] = A[0][0] * A[0][1] + A[1][0] * A[1][1]; - ATA[1][0] = A[0][1] * A[0][0] + A[1][1] * A[1][0]; - ATA[1][1] = A[0][1] * A[0][1] + A[1][1] * A[1][1]; - - V[0][0] = 1.0; - divide_temp = l1 - ATA[1][1]; - if (divide_temp == 0) { - return T; - } - V[0][1] = ATA[1][0] / divide_temp; - norm = sqrt_newton((V[0][0] * V[0][0]) + (V[0][1] * V[0][1])); - V[0][0] /= norm; - V[0][1] /= norm; - - V[1][0] = 1.0; - divide_temp = l2 - ATA[1][1]; - if (divide_temp == 0) { - return T; - } - V[1][1] = ATA[1][0] / divide_temp; - norm = sqrt_newton((V[1][0] * V[1][0]) + (V[1][1] * V[1][1])); - V[1][0] /= norm; - V[1][1] /= norm; - - if (V[0][1] * V[1][0] < 0) { - V[0][0] = -V[0][0]; - V[0][1] = -V[0][1]; - } - if ((S[0] * U[0][0] * V[0][0] + S[1] * U[0][1] * V[1][0]) * A[0][0] < 0) { - U[0][0] = -U[0][0]; - U[0][1] = -U[0][1]; - U[1][0] = -U[1][0]; - U[1][1] = -U[1][1]; - } - //============================================================================================================================================ - - if (DL_ABS((A[0][0] * A[1][1]) - A[0][1] * A[1][0]) < 1e-8) { - if ((((U[0][0] * U[1][1]) - U[0][1] * U[1][0]) * ((V[0][0] * V[1][1]) - V[0][1] * V[1][0])) > 0) { - T.array[0][0] = U[0][0] * V[0][0] + U[0][1] * V[1][0]; - T.array[0][1] = U[0][0] * V[0][1] + U[0][1] * V[1][1]; - T.array[1][0] = U[1][0] * V[0][0] + U[1][1] * V[1][0]; - T.array[1][1] = U[1][0] * V[0][1] + U[1][1] * V[1][1]; - } else { - double s = d[dim - 1]; - d[dim - 1] = -1; - T.array[0][0] = d[0] * U[0][0] * V[0][0] + d[1] * U[0][1] * V[1][0]; - T.array[0][1] = d[0] * U[0][0] * V[0][1] + d[1] * U[0][1] * V[1][1]; - T.array[1][0] = d[0] * U[1][0] * V[0][0] + d[1] * U[1][1] * V[1][0]; - T.array[1][1] = d[0] * U[1][0] * V[0][1] + d[1] * U[1][1] * V[1][1]; - d[dim - 1] = s; - } - } else { - T.array[0][0] = d[0] * U[0][0] * V[0][0] + d[1] * U[0][1] * V[1][0]; - T.array[0][1] = d[0] * U[0][0] * V[0][1] + d[1] * U[0][1] * V[1][1]; - T.array[1][0] = d[0] * U[1][0] * V[0][0] + d[1] * U[1][1] * V[1][0]; - T.array[1][1] = d[0] * U[1][0] * V[0][1] + d[1] * U[1][1] * V[1][1]; - } - - double Ex = 0.0; - double Ex2 = 0.0; - double Ey = 0.0; - double Ey2 = 0.0; - for (int i = 0; i < num; i++) { - Ex += src_demean.array[i][0]; - Ex2 += (src_demean.array[i][0] * src_demean.array[i][0]); - Ey += src_demean.array[i][1]; - Ey2 += (src_demean.array[i][1] * src_demean.array[i][1]); - } - Ex /= num; - Ex2 /= num; - Ey /= num; - Ey2 /= num; - double var_sum = (Ex2 - Ex * Ex) + (Ey2 - Ey * Ey); - double scale = (S[0] * d[0] + S[1] * d[1]) / var_sum; - - T.array[0][2] = dst_mean_x - scale * (T.array[0][0] * src_mean_x + T.array[0][1] * src_mean_y); - T.array[1][2] = dst_mean_y - scale * (T.array[1][0] * src_mean_x + T.array[1][1] * src_mean_y); - - T.array[0][0] *= scale; - T.array[0][1] *= scale; - T.array[1][0] *= scale; - T.array[1][1] *= scale; - T.array[2][0] = 0; - T.array[2][1] = 0; - T.array[2][2] = 1; - - return T; -} - -Matrix get_perspective_transform(Matrix &source_coord, Matrix &dest_coord) -{ - Matrix m(3, 3); - Matrix A(8, 8); - - for (int i = 0; i < 4; i++) { - A.array[i][0] = source_coord.array[i][0]; - A.array[i][1] = source_coord.array[i][1]; - A.array[i][2] = 1; - A.array[i][3] = 0; - A.array[i][4] = 0; - A.array[i][5] = 0; - A.array[i][6] = -dest_coord.array[i][0] * source_coord.array[i][0]; - A.array[i][7] = -dest_coord.array[i][0] * source_coord.array[i][1]; - } - for (int i = 4; i < 8; i++) { - A.array[i][0] = 0; - A.array[i][1] = 0; - A.array[i][2] = 0; - A.array[i][3] = source_coord.array[i - 4][0]; - A.array[i][4] = source_coord.array[i - 4][1]; - A.array[i][5] = 1; - A.array[i][6] = -dest_coord.array[i - 4][1] * source_coord.array[i - 4][0]; - A.array[i][7] = -dest_coord.array[i - 4][1] * source_coord.array[i - 4][1]; - } - Matrix Ainv = A.inverse(); - for (int i = 0; i < 8; i++) { - m.array[i / 3][i % 3] = - (((Ainv.array[i][0]) * dest_coord.array[0][0]) + ((Ainv.array[i][1]) * dest_coord.array[1][0]) + - ((Ainv.array[i][2]) * dest_coord.array[2][0]) + ((Ainv.array[i][3]) * dest_coord.array[3][0]) + - ((Ainv.array[i][4]) * dest_coord.array[0][1]) + ((Ainv.array[i][5]) * dest_coord.array[1][1]) + - ((Ainv.array[i][6]) * dest_coord.array[2][1]) + ((Ainv.array[i][7]) * dest_coord.array[3][1])); - } - m.array[2][2] = 1; - return m; -} -} // namespace math -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/include/dl_memory_manager.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/include/dl_memory_manager.hpp deleted file mode 100644 index c3aaff50..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/include/dl_memory_manager.hpp +++ /dev/null @@ -1,327 +0,0 @@ -#pragma once - -#include "dl_module_base.hpp" -#include "esp_heap_caps.h" -#include "fbs_model.hpp" -#include - -namespace dl { -namespace memory { - -/** - * @brief Memory manager base class, each model has its own memory manager - * TODO: share memory manager with different models - */ -class MemoryManagerBase { -private: - void *psram_root; // PSRAM root pointer - void *internal_root; // Internal ram pointer - -public: - std::vector tensors; // All tensors in the model - int alignment; // The root pointer needs to be aligned must be a power of two - std::map name2index; // Tensor name to index map - size_t internal_size; // The bytes of internal ram - size_t psram_size; // The bytes of psram - - /** - * @brief Construct a new MemoryManager object. - */ - MemoryManagerBase(size_t internal_size, int alignment = 16) : - psram_root(nullptr), - internal_root(nullptr), - tensors({}), - alignment(alignment), - internal_size(internal_size), - psram_size(0) - { - } - - /** - * @brief Destroy the MemoryManager object. Return resource. - */ - virtual ~MemoryManagerBase() { this->reset(); } - - /** - * @brief Allocate memory for each tensor, include all input and output tensors - * - * @param fbs_model FlatBuffer's Model - * @param execution_plan Topological sorted module list - * - * @return The output TensorBase vector - */ - virtual std::vector alloc(fbs::FbsModel *fbs_model, std::vector &execution_plan) - { - return {}; - } - - /** - * @brief Set preload address for module's parameters - * - * @param execution_plan Topological sorted module list - */ - virtual void set_preload_addr(std::vector execution_plan) {} - - /** - * @brief Reset the memory manager, free all memory for each tensor, include all input and output tensors - */ - virtual void reset(); - - /** - * @brief Get tensor by index - * - * @param index The tensor index, type: int - * - * @return The TensorBase pointer - */ - TensorBase *get_tensor(int index); - - /** - * @brief Get tensor by name - * - * @param name The tensor name, type: std::string - * - * @return The TensorBase pointer - */ - TensorBase *get_tensor(std::string &name); - - /** - * @brief Get tensor index by name - * - * @param name The tensor name, type: std::string - * - * @return The TensorBase index - */ - int get_tensor_index(std::string &name); - - /** - * @brief Allocate root pointer by dl::tool::calloc_aligned API - * - * @param internal_size Size, in bytes, of a chunk of Internal ram to allocate - * @param psram_size Size, in bytes, of a chunk of PSRAM to allocate - * - * @return true if success, false if fail - */ - bool root_calloc(size_t internal_size, size_t psram_size); - - /** - * @brief Allocate root pointer by dl::tool::calloc_aligned API - * - * @param psram_size Size, in bytes, of a chunk of PSRAM to allocate - * - * @return data pointer if success, nullptr if fail - */ - void *psram_root_calloc(size_t psram_size); - - /** - * @brief Allocate root pointer by dl::tool::calloc_aligned API - * - * @param internal_size Size, in bytes, of a chunk of Internal ram to allocate - * - * @return data pointer if success, nullptr if fail - */ - void *internal_root_calloc(size_t internal_size); - - /** - * @brief Free psram root and internal root pointer - */ - void root_free(); - - /** - * @brief Get psram root pointer - */ - void *get_psram_root() { return this->psram_root; } - - /** - * @brief Get internal ram root pointer - */ - void *get_internal_root() { return this->internal_root; } -}; - -/** - * @brief Tensor info, include tensor name, shape, dtype, size, time range - * and call times, which is used to plan model memory - */ -class TensorInfo { -private: - std::string name; - int time_begin; - int time_end; - std::vector shape; - dtype_t dtype; - int exponent; - size_t size; // Size, in bytes - uint32_t call_times; - uint32_t offset; // PSRAM offset - uint32_t internal_offset; // Internal ram offset, used to allocate tensor on both PSRAM and internal ram - bool is_internal; - TensorInfo *m_leader_tensor; - TensorInfo - *m_follower_dirty_tensor; // Only reference the follower tensor which will modify the data of leader tensor. - -public: - TensorInfo(std::string &name, - int time_begin, - int time_end, - std::vector shape, - dtype_t dtype, - int exponent, - bool is_internal = false); - - ~TensorInfo() {} - - void set_inplace_leader_tensor(TensorInfo *tensor); - - void set_inplace_follower_tensor(TensorInfo *tensor) { m_follower_dirty_tensor = tensor; } - - TensorInfo *get_inplace_follower_tensor() { return m_follower_dirty_tensor; } - - void update_time(int new_time); - - TensorBase *create_tensor(void *internal_root, void *psram_root); - - bool is_inplaced() { return this->m_leader_tensor != nullptr; } - - uint32_t get_offset() - { - if (m_leader_tensor) { - return m_leader_tensor->get_offset(); - } - return this->offset; - } - - void set_offset(uint32_t offset) - { - if (m_leader_tensor) { - m_leader_tensor->set_offset(offset); - } - this->offset = offset; - } - - uint32_t get_internal_offset() - { - if (m_leader_tensor) { - return m_leader_tensor->get_internal_offset(); - } - return this->internal_offset; - } - - bool get_internal_state() - { - if (m_leader_tensor) { - return m_leader_tensor->get_internal_state(); - } - return this->is_internal; - } - - void set_internal_state(bool is_internal) - { - if (m_leader_tensor) { - m_leader_tensor->set_internal_state(is_internal); - } - this->is_internal = is_internal; - } - - void set_internal_offset(uint32_t offset) - { - if (m_leader_tensor) { - m_leader_tensor->set_internal_offset(offset); - m_leader_tensor->set_internal_state(true); - } - this->is_internal = true; - this->internal_offset = offset; - } - - int get_time_end() - { - if (m_leader_tensor) { - return m_leader_tensor->get_time_end(); - } - return this->time_end; - } - - int get_time_begin() - { - if (m_leader_tensor) { - return m_leader_tensor->get_time_begin(); - } - return this->time_begin; - } - - size_t get_size() { return this->size; } - - std::string get_name() { return this->name; } - - std::vector get_shape() { return this->shape; } - - void print() - { - printf("name:%s, from %d to %d, size:%d, offset:(%ld, %ld)\n", - name.c_str(), - time_begin, - time_end, - size, - offset, - internal_offset); - } -}; - -/** - * @brief Memory chunk, include size, is free, offset, alignment and tensor, which is used to simulate memory allocation - */ -class MemoryChunk { -public: - size_t size; - bool is_free; - int offset; - int alignment; - TensorInfo *tensor; - - MemoryChunk(size_t size, int is_free, int alignment = 16); - - MemoryChunk(TensorInfo *tensor, int alignment = 16); - - ~MemoryChunk() {} - - /** - * @brief Merge continuous free chunk - */ - MemoryChunk *merge_free_chunk(MemoryChunk *chunk); - - /** - * @brief Insert tensor into free chunk - */ - MemoryChunk *insert(TensorInfo *tensor); - - /** - * @brief Extend free chunk and insert tensor - */ - MemoryChunk *extend(TensorInfo *tensor); - - /** - * @brief Free memory chunk, set is_free to true and set tensor to nullptr - */ - void free() - { - this->is_free = true; - this->tensor = nullptr; - } - - /** - * @brief get aligned size, which is 16/alignemt bytes aligned - */ - size_t get_aligned_size(size_t size); -}; - -/** - * @brief print memory list - */ -void print_memory_list(const char *tag, std::list &memory_list); - -/** - * @brief sort memory list by memory chunk size - */ -void sort_memory_list(std::list &memory_list); - -}; // namespace memory -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/include/dl_memory_manager_greedy.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/include/dl_memory_manager_greedy.hpp deleted file mode 100644 index ee8555a6..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/include/dl_memory_manager_greedy.hpp +++ /dev/null @@ -1,64 +0,0 @@ -#pragma once - -#include "dl_memory_manager.hpp" - -namespace dl { -namespace memory { - -/** - * @brief Greedy memory manager, allocate memory for each tensor in the order of the execution plan. - */ -class MemoryManagerGreedy : public MemoryManagerBase { - // refer to https://zhuanlan.zhihu.com/p/423688020 -private: - std::list memory_list; // - std::list free_list; - std::list internal_memory_list; - std::list internal_free_list; - - void get_tensor_info_from_fbs(fbs::FbsModel *fbs_model, - std::vector execution_plan, - std::vector &tensor_info); - - int simulate(std::vector &tensor_info, int node_num); - - int simulate_with_internal_memory(std::vector &tensor_info, int node_num); - - MemoryChunk *free_tensor(TensorInfo *tensor, - std::list &memory_list, - std::list &free_list); - - MemoryChunk *alloc_tensor(TensorInfo *tensor, int mode = 0); - - MemoryChunk *alloc_internal_tensor(TensorInfo *tensor, int mode = 0); - - void free_memory_list(); - -public: - MemoryManagerGreedy(int max_internal_size, int alignment = 16) : MemoryManagerBase(max_internal_size, alignment) {} - - ~MemoryManagerGreedy() { this->free_memory_list(); } - - /** - * @brief Allocate memory for each tensor, include all input and output tensors - * - * @param tensors The input TensorInfo vector - * - * @return The output TensorInfo vector - */ - std::vector alloc(fbs::FbsModel *fbs_model, std::vector &execution_plan); - - /** - * @brief Set preload address for module's parameters - * - * @param execution_plan Topological sorted module list - */ - void set_preload_addr(std::vector execution_plan); - - /** - * @brief Free memory, include all tensors and memory list - */ - void free(); -}; -} // namespace memory -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/include/dl_model_base.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/include/dl_model_base.hpp deleted file mode 100644 index 38d7af66..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/include/dl_model_base.hpp +++ /dev/null @@ -1,146 +0,0 @@ -#pragma once - -#include "dl_constant.hpp" -#include "dl_memory_manager.hpp" -#include "dl_module_base.hpp" -#include "dl_variable.hpp" -#include "esp_log.h" -#include "fbs_loader.hpp" -#include "fbs_model.hpp" -namespace dl { - -// currently only support MEMORY_MANAGER_GREEDY -typedef enum { MEMORY_MANAGER_GREEDY = 0, LINEAR_MEMORY_MANAGER = 1 } memory_manager_t; - -/** - * @brief Neural Network Model. - */ -class Model { -private: - fbs::FbsLoader *fbs_loader = nullptr; /**/ - fbs::FbsModel *fbs_model = nullptr; /**/ - std::vector - execution_plan; /**/ - dl::memory::MemoryManagerBase *memory_manager = nullptr; /**/ - std::map inputs; /* The map of model input's name and TensorBase* */ - std::map outputs; /* The map of model output's name and TensorBase* */ - std::string name; /* The name of model */ - int64_t version; /* The version of model */ - std::string doc_string; /* doc string of model*/ - -public: - Model() {} - - Model(const char *name, - fbs::model_location_type_t location = fbs::MODEL_LOCATION_IN_FLASH_RODATA, - int model_index = 0, - int internal_size = 0, - memory_manager_t mm_type = MEMORY_MANAGER_GREEDY, - uint8_t *key = nullptr) - { - if (this->load(name, location, model_index, key) == ESP_OK) { - this->build(internal_size, mm_type); - } - } - - Model(fbs::FbsModel *fbs_model, int internal_size = 0, memory_manager_t mm_type = MEMORY_MANAGER_GREEDY); - - /** - * @brief Destroy the Model object. - * - */ - virtual ~Model(); - - /** - * @brief Load model graph and parameters from flash or sdcard. - * - * @param rodata_address_or_partition_label_or_path - * The address of model data while location is MODEL_LOCATION_IN_FLASH_RODATA. - * The label of partition while location is MODEL_LOCATION_IN_FLASH_PARTITION. - * The path of model while location is MODEL_LOCATION_IN_SDCARD. - * @param location The model location. - * @param model_index The model index of packed models. - * @param key The key of encrypted model. - */ - virtual esp_err_t load(const char *rodata_address_or_partition_label_or_path, - fbs::model_location_type_t location = fbs::MODEL_LOCATION_IN_FLASH_RODATA, - int model_index = 0, - uint8_t *key = nullptr); - - /** - * @brief Load model graph and parameters from Flatbuffers model - * - * @param fbs_model The FlatBuffers model - */ - virtual esp_err_t load(fbs::FbsModel *fbs_model); - - /** - * @brief Allocate memory for the model. - * - * @param internal_size Internal ram size, in bytes - * @param mm_type Type of memory manager - * @param preload Whether to preload the model's parameters to internal ram (not implemented yet) - */ - virtual void build(size_t internal_size, memory_manager_t mm_type = MEMORY_MANAGER_GREEDY, bool preload = false); - - /** - * @brief Run the model module by module. - */ - virtual void run(runtime_mode_t mode = RUNTIME_MODE_SINGLE_CORE); - - /** - * @brief Run the model module by module. - * - * @param input The model input. - */ - virtual void run(TensorBase *input, runtime_mode_t mode = RUNTIME_MODE_AUTO); - - /** - * @brief Run the model module by module. - * - * @param user_inputs The model inputs. - * @param user_outputs It's for debug to pecify the output of the intermediate layer; Under normal use, there is no - * need to pass a value to this parameter. If no parameter is passed, the default is the - * graphical output, which can be obtained through Model::get_outputs(). - */ - virtual void run(std::map &user_inputs, - runtime_mode_t mode = RUNTIME_MODE_AUTO, - std::map user_outputs = {}); - - /** - * @brief Get inputs of model - * - * @return The map of model input's name and TensorBase* - */ - virtual std::map &get_inputs(); - - /** - * @brief Get intermediate TensorBase of model - * - * @return The intermediate TensorBase*. Note: When using memory manager, - * the content of TensorBase's data may be overwritten by the - * outputs of other operators. - */ - virtual TensorBase *get_intermediate(std::string name); - - /** - * @brief Get outputs of model - * - * @return The map of model output's name and TensorBase* - */ - virtual std::map &get_outputs(); - - /** - * @brief Print the model. - */ - virtual void print(); - - /** - * @brief Get the fbs model instance. - * - * @return fbs::FbsModel * - */ - virtual fbs::FbsModel *get_fbs_model() { return fbs_model; } -}; - -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/src/dl_memory_manager.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/src/dl_memory_manager.cpp deleted file mode 100644 index 505e3d14..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/src/dl_memory_manager.cpp +++ /dev/null @@ -1,290 +0,0 @@ -#include "dl_memory_manager.hpp" - -namespace dl { -namespace memory { - -/*oooooooooooooooooo00000000000000000000 MemoryManagerBase 00000000000000000000ooooooooooooooooo*/ - -void MemoryManagerBase::reset() -{ - if (!this->tensors.empty()) { - for (int i = 0; i < this->tensors.size(); ++i) { - delete tensors[i]; - } - this->tensors.clear(); - } - this->root_free(); - this->name2index.clear(); -} - -TensorBase *MemoryManagerBase::get_tensor(int index) -{ - if (index < 0 || index >= this->tensors.size()) { - return nullptr; - } - return this->tensors[index]; -} - -TensorBase *MemoryManagerBase::get_tensor(std::string &name) -{ - auto it = this->name2index.find(name); - if (it != name2index.end()) { - return tensors[it->second]; - } else { - return nullptr; - } - - return nullptr; -} - -int MemoryManagerBase::get_tensor_index(std::string &name) -{ - auto it = this->name2index.find(name); - if (it != name2index.end()) { - return it->second; - } else { - return -1; - } - - return -1; -} - -bool MemoryManagerBase::root_calloc(size_t internal_size, size_t psram_size) -{ - if (internal_size > 0) { - this->internal_root = tool::calloc_aligned(internal_size, 1, alignment, MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL); - if (this->internal_root == nullptr) { - return false; - } - } - - if (psram_size > 0) { - this->psram_root = tool::calloc_aligned(psram_size, 1, alignment, MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM); - if (this->psram_root == nullptr) { - return false; - } - } - - return true; -} - -void *MemoryManagerBase::psram_root_calloc(size_t psram_size) -{ - if (psram_size > 0) { - this->psram_root = tool::calloc_aligned(psram_size, 1, alignment, MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM); - if (this->psram_root) - this->psram_size = psram_size; - } - return this->psram_root; -} - -void *MemoryManagerBase::internal_root_calloc(size_t internal_size) -{ - if (internal_size > 0) { - this->internal_root = tool::calloc_aligned(internal_size, 1, alignment, MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL); - if (this->internal_root) - this->internal_size = internal_size; - } - return this->internal_root; -} - -void MemoryManagerBase::root_free() -{ - // In IDF, free(p) is equivalent to heap_caps_free(p). - if (this->internal_root) { - ::free(this->internal_root); - this->internal_root = nullptr; - } - if (this->psram_root) { - ::free(this->psram_root); - this->psram_root = nullptr; - } -} - -/*oooooooooooooooooo00000000000000000000 TensorInfo 00000000000000000000ooooooooooooooooo*/ - -TensorInfo::TensorInfo(std::string &name, - int time_begin, - int time_end, - std::vector shape, - dtype_t dtype, - int exponent, - bool is_internal) : - name(name), - time_begin(time_begin), - time_end(time_end), - dtype(dtype), - exponent(exponent), - is_internal(is_internal), - m_leader_tensor(nullptr), - m_follower_dirty_tensor(nullptr) -{ - if (shape.size() > 0) { - this->shape.push_back(shape[0]); - this->size = shape[0]; - for (int i = 1; i < shape.size(); i++) { - this->shape.push_back(shape[i]); - this->size *= shape[i]; - } - } - - if (dtype == DATA_TYPE_FLOAT || dtype == DATA_TYPE_INT32 || dtype == DATA_TYPE_UINT32) { - this->size = this->size * 4; - } else if (dtype == DATA_TYPE_UINT8 || dtype == DATA_TYPE_INT8) { - this->size = this->size * 1; - } else if (dtype == DATA_TYPE_UINT16 || dtype == DATA_TYPE_INT16) { - this->size = this->size * 2; - } - - this->call_times = 0; - this->offset = 0; - this->internal_offset = 0; -} - -void TensorInfo::set_inplace_leader_tensor(TensorInfo *tensor) -{ - this->m_leader_tensor = tensor; - if (tensor) { - if (tensor->time_end < this->time_end || this->time_end == -1) { - tensor->update_time(this->time_end); - } - } -} - -void TensorInfo::update_time(int new_time) -{ - if (m_leader_tensor) { // if inplace tensor is not null, update end time of inplace tensor - m_leader_tensor->update_time(new_time); - this->time_end = m_leader_tensor->time_end; - } else { - if (new_time == -1) { - this->time_end = -1; - } else { - if (new_time > this->time_end) { - this->time_end = new_time; - } - } - } - - this->call_times++; -} - -TensorBase *TensorInfo::create_tensor(void *internal_root, void *psram_root) -{ - TensorBase *tensor = nullptr; - uint8_t *element = nullptr; - - if (this->is_internal) { - element = (uint8_t *)internal_root + this->get_internal_offset(); - } else { - element = (uint8_t *)psram_root + this->get_offset(); - } - - tensor = new TensorBase(shape, element, exponent, dtype, false); - return tensor; -} - -/*oooooooooooooooooo00000000000000000000 MemoryChunk 00000000000000000000ooooooooooooooooo*/ - -MemoryChunk::MemoryChunk(size_t size, int is_free, int alignment) : size(size), is_free(is_free), alignment(alignment) -{ - this->tensor = nullptr; - this->offset = 0; -} - -MemoryChunk::MemoryChunk(TensorInfo *tensor, int alignment) -{ - this->alignment = alignment; - this->size = this->get_aligned_size(tensor->get_size()); - this->is_free = false; - this->tensor = tensor; - this->offset = 0; -} - -MemoryChunk *MemoryChunk::merge_free_chunk(MemoryChunk *chunk) -{ - if (chunk != nullptr) { - if (chunk->is_free) { - this->size = this->size + chunk->size; - if (chunk->offset < this->offset) { - this->offset = chunk->offset; - } - return this; - } - } - return nullptr; -} - -MemoryChunk *MemoryChunk::insert(TensorInfo *tensor) -{ - int aligned_size = this->get_aligned_size(tensor->get_size()); - - if (this->is_free && this->size >= aligned_size) { - this->tensor = tensor; - this->is_free = false; - if (this->size > aligned_size) { - MemoryChunk *chunk = new MemoryChunk(this->size - aligned_size, true, this->alignment); - this->size = aligned_size; - chunk->offset = this->offset + aligned_size; - return chunk; - } - } - return nullptr; -} - -MemoryChunk *MemoryChunk::extend(TensorInfo *tensor) -{ - int aligned_size = this->get_aligned_size(tensor->get_size()); - - // only extend the size of memory chunk - if (this->is_free && this->size < aligned_size) { - this->tensor = tensor; - this->is_free = false; - this->size = aligned_size; - return this; - } - return nullptr; -} - -size_t MemoryChunk::get_aligned_size(size_t size) -{ - int remainder = size % this->alignment; - - if (remainder != 0) { - return size + this->alignment - remainder; - } - - return size; -} - -void print_memory_list(const char *tag, std::list &memory_list) -{ - for (auto it = memory_list.begin(); it != memory_list.end(); ++it) { - std::string name = ""; - std::string state = "false"; - if ((*it)->tensor) { - name = (*it)->tensor->get_name(); - } - if ((*it)->is_free) { - state = "true"; - } - ESP_LOGI(tag, - "[size:%d, offset:%d, free:%s, tensor:%s] -> ", - (*it)->size, - (*it)->offset, - state.c_str(), - name.c_str()); - } - printf("\n"); -} - -void sort_memory_list(std::list &memory_list) -{ - // sort free list by size - memory_list.sort([](MemoryChunk *a, MemoryChunk *b) { - return a->size < b->size; // 升序排序 - }); -} - -} // namespace memory -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/src/dl_memory_manager_greedy.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/src/dl_memory_manager_greedy.cpp deleted file mode 100644 index d341ac67..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/src/dl_memory_manager_greedy.cpp +++ /dev/null @@ -1,467 +0,0 @@ -#include - -#include "dl_memory_manager_greedy.hpp" -#include "esp_log.h" - -static const char *TAG = "MemoryManagerGreedy"; - -namespace dl { -namespace memory { - -// std::vector MemoryManagerGreedy::alloc(fbs::FbsModel *fbs_model, -// std::vector &execution_plan) -// { -// std::vector tensor_info; -// // get all tensor info from flatbuffers -// this->get_tensor_info_from_fbs(fbs_model, execution_plan, tensor_info); - -// // simulate the memory allocation -// this->simulate(tensor_info, execution_plan.size()); -// int psram_size = memory_list.back()->offset + memory_list.back()->size; - -// // start to allocate tensors -// void *psram_root = this->psram_root_calloc(psram_size); -// this->tensors.reserve(tensor_info.size()); -// for (int i = 0; i < tensor_info.size(); i++) { -// this->tensors.push_back(tensor_info[i]->create_tensor(nullptr, psram_root)); -// delete tensor_info[i]; -// } - -// // free memory list -// this->free_memory_list(); - -// return this->tensors; -// } - -std::vector MemoryManagerGreedy::alloc(fbs::FbsModel *fbs_model, - std::vector &execution_plan) -{ - std::vector tensor_info; - // get all tensor info from flatbuffers - this->get_tensor_info_from_fbs(fbs_model, execution_plan, tensor_info); - - // simulate the memory allocation - this->simulate_with_internal_memory(tensor_info, execution_plan.size()); - void *psram_root = nullptr; - if (!memory_list.empty()) { - int psram_size = memory_list.back()->offset + memory_list.back()->size; - psram_root = this->psram_root_calloc(psram_size); - } - - void *internal_root = nullptr; - if (!internal_memory_list.empty()) { - internal_root = this->internal_root_calloc(this->internal_size); - } - - // start to allocate tensors - this->tensors.reserve(tensor_info.size()); - for (int i = 0; i < tensor_info.size(); i++) { - this->tensors.push_back(tensor_info[i]->create_tensor(internal_root, psram_root)); - } - - // free TensorInfo vector - for (int i = 0; i < tensor_info.size(); i++) { - delete tensor_info[i]; - } - - // free memory list - this->free_memory_list(); - - return this->tensors; -} - -void MemoryManagerGreedy::free() -{ - if (!this->tensors.empty()) { - for (int i = 0; i < this->tensors.size(); ++i) { - delete tensors[i]; - } - this->tensors.clear(); - } - this->root_free(); - this->name2index.clear(); - this->free_memory_list(); -} - -void MemoryManagerGreedy::get_tensor_info_from_fbs(fbs::FbsModel *fbs_model, - std::vector execution_plan, - std::vector &tensor_info) -{ - // 1. add graph inputs - std::vector graph_inputs = fbs_model->get_graph_inputs(); - - for (int i = 0; i < graph_inputs.size(); i++) { - std::string name = graph_inputs[i]; - TensorInfo *info = new TensorInfo(name, - 0, - -1, - fbs_model->get_value_info_shape(name), - fbs_model->get_value_info_dtype(name), - fbs_model->get_value_info_exponent(name)); - tensor_info.push_back(info); - this->name2index.emplace(name, tensor_info.size() - 1); - } - - // 2. add tensor outputs and update time line of tensors - std::vector graph_outputs = fbs_model->get_graph_outputs(); - std::vector sorted_nodes = fbs_model->topological_sort(); - std::vector op_inputs; - std::vector op_outputs; - for (int i = 0; i < execution_plan.size(); i++) { - dl::module::Module *module = execution_plan[i]; - if (!module) { - ESP_LOGE(__FUNCTION__, "module %d is nullptr\n", i); - break; - } - - // update the time of tensor by node's inputs - std::vector> input_shapes; - fbs_model->get_operation_inputs_and_outputs(sorted_nodes[i], op_inputs, op_outputs); - - for (int j = 0; j < op_inputs.size(); j++) { - auto iter = this->name2index.find(op_inputs[j]); - if (iter != this->name2index.end()) { - // The previously existing tensor will dirty the input. Must disconnect the inplace link. - TensorInfo *follower_tensor = tensor_info[iter->second]->get_inplace_follower_tensor(); - if (follower_tensor) { - tensor_info[iter->second]->set_inplace_follower_tensor(nullptr); - follower_tensor->set_inplace_leader_tensor(nullptr); - } - - auto out_iter = std::find(graph_outputs.begin(), graph_outputs.end(), iter->first); - if (out_iter == graph_outputs.end()) - tensor_info[iter->second]->update_time(i + 1); // free this tensor next step - input_shapes.push_back(tensor_info[iter->second]->get_shape()); - module->m_inputs_index.push_back(iter->second); // assign input index of module - } - } - - // add output tensors - std::vector> output_shapes = module->get_output_shape(input_shapes); - if ((module->inplace == MODULE_INPLACE_UNCHANGED_BUFFER || module->inplace == MODULE_INPLACE_CHANGED_BUFFER) && - op_outputs.size() == 1) { - // inplace, assign first input tensor as output tensor - // TODO:: more accuracy inplace position - auto iter = name2index.find(op_inputs[0]); - if (iter != name2index.end()) { - std::string name = op_outputs[0]; - TensorInfo *inplace_tensor = tensor_info[iter->second]; - TensorInfo *info = new TensorInfo(name, - i, - -1, - output_shapes[0], - fbs_model->get_value_info_dtype(name), - fbs_model->get_value_info_exponent(name)); - - // If op_inputs[0] is graph output. It can't be set inplace. - auto out_iter = std::find(graph_outputs.begin(), graph_outputs.end(), iter->first); - if (out_iter == graph_outputs.end() && info->get_size() <= inplace_tensor->get_size()) { - TensorInfo *pre_follower_tensor = inplace_tensor->get_inplace_follower_tensor(); - // The previously existing tensor will dirty the input. Must disconnect the inplace link. - if (pre_follower_tensor) { - inplace_tensor->set_inplace_follower_tensor(nullptr); - pre_follower_tensor->set_inplace_leader_tensor(nullptr); - } - - // Relink the inplace. - info->set_inplace_leader_tensor(inplace_tensor); - if (module->inplace == MODULE_INPLACE_CHANGED_BUFFER) { - inplace_tensor->set_inplace_follower_tensor(info); - } - } - - tensor_info.push_back(info); - this->name2index.emplace(name, tensor_info.size() - 1); - module->m_outputs_index.push_back(tensor_info.size() - 1); // assign output index of module - } else { - ESP_LOGE(TAG, "input tensor %s not found, skip %s\n", op_inputs[0].c_str(), op_outputs[0].c_str()); - continue; - } - } else { - for (int j = 0; j < op_outputs.size(); j++) { - std::string name = op_outputs[j]; - TensorInfo *info = new TensorInfo(name, - i, - -1, - output_shapes[j], - fbs_model->get_value_info_dtype(name), - fbs_model->get_value_info_exponent(name)); - tensor_info.push_back(info); - this->name2index.emplace(name, tensor_info.size() - 1); - module->m_outputs_index.push_back(tensor_info.size() - 1); // assign output index of module - } - } - } -} - -void MemoryManagerGreedy::set_preload_addr(std::vector execution_plan) -{ - void *internal_root = this->get_internal_root(); - if (!internal_root) { - ESP_LOGW(TAG, "Internal root is nullptr, Ignore preload optimization\n"); - return; - } - - for (int i = 0; i < execution_plan.size(); i++) { - dl::module::Module *module = execution_plan[i]; - if (!module) { - ESP_LOGE(__FUNCTION__, "module %d is nullptr\n", i); - break; - } - module->set_preload_addr(internal_root, this->internal_size); - } -} - -int MemoryManagerGreedy::simulate(std::vector &tensor_info, int node_num) -{ - std::vector> node_alloc_tensors(node_num); - std::vector> node_free_tensors(node_num); - - for (int i = 0; i < node_num; i++) { - node_alloc_tensors[i] = {}; - node_free_tensors[i] = {}; - } - - for (int i = 0; i < tensor_info.size(); i++) { - // If this tensor is inplaced by other tensor, skip it - if (tensor_info[i]->is_inplaced()) { - continue; - } - - int time_begin = tensor_info[i]->get_time_begin(); - int time_end = tensor_info[i]->get_time_end(); - - if (time_begin >= 0 && time_begin < node_num) { - node_alloc_tensors[time_begin].push_back(tensor_info[i]); - } - - if (time_end >= 0 && time_end < node_num) { - node_free_tensors[time_end].push_back(tensor_info[i]); - } - } - - for (int i = 0; i < node_num; i++) { - for (auto it = node_free_tensors[i].begin(); it != node_free_tensors[i].end(); it++) { - free_tensor(*it, this->memory_list, this->free_list); - } - - for (auto it = node_alloc_tensors[i].begin(); it != node_alloc_tensors[i].end(); it++) { - alloc_tensor(*it); - } - - // print_memory_list("psram memory list:", this->memory_list); - // print_memory_list("psram free list:", this->free_list); - } - - size_t max_ram_size = memory_list.back()->offset + memory_list.back()->size; - ESP_LOGI(TAG, "Maximum mermory size: %d\n", max_ram_size); - - return max_ram_size; -} - -int MemoryManagerGreedy::simulate_with_internal_memory(std::vector &tensor_info, int node_num) -{ - if (this->internal_size > this->alignment) { - MemoryChunk *internal_chunk = new MemoryChunk(this->internal_size, true, this->alignment); - this->internal_memory_list.push_back(internal_chunk); - this->internal_free_list.push_back(internal_chunk); - } else { - return simulate(tensor_info, node_num); - } - std::vector> node_alloc_tensors(node_num); - std::vector> node_free_tensors(node_num); - - for (int i = 0; i < node_num; i++) { - node_alloc_tensors[i] = {}; - node_free_tensors[i] = {}; - } - - for (int i = 0; i < tensor_info.size(); i++) { - // If this tensor is inplaced by other tensor, skip it - if (tensor_info[i]->is_inplaced()) { - continue; - } - - int time_begin = tensor_info[i]->get_time_begin(); - int time_end = tensor_info[i]->get_time_end(); - - if (time_begin >= 0 && time_begin < node_num) { - node_alloc_tensors[time_begin].push_back(tensor_info[i]); - } - - if (time_end >= 0 && time_end < node_num) { - node_free_tensors[time_end].push_back(tensor_info[i]); - } - } - - for (int i = 0; i < node_num; i++) { - for (auto it = node_free_tensors[i].begin(); it != node_free_tensors[i].end(); it++) { - if ((*it)->get_internal_state()) { - free_tensor(*it, this->internal_memory_list, this->internal_free_list); - } else { - free_tensor(*it, this->memory_list, this->free_list); - } - } - - for (auto it = node_alloc_tensors[i].begin(); it != node_alloc_tensors[i].end(); it++) { - MemoryChunk *chunk = alloc_internal_tensor(*it); - if (chunk == nullptr) { - chunk = alloc_tensor(*it); - } - } - - // print_memory_list("psram memory list", this->memory_list); - // print_memory_list("psram free list", this->free_list); - // print_memory_list("internal memory list", this->internal_memory_list); - // print_memory_list("internal free list", this->internal_free_list); - } - - size_t psram_size = 0; - if (!memory_list.empty()) { - psram_size = memory_list.back()->offset + memory_list.back()->size; - } - size_t internal_ram_size = 0; - if (!internal_memory_list.empty()) { - internal_ram_size = internal_memory_list.back()->offset + internal_memory_list.back()->size; - } - ESP_LOGI(TAG, "Maximum psram size: %d, Maximum internal ram size: %d\n", psram_size, internal_ram_size); - - return psram_size + internal_ram_size; -} - -MemoryChunk *MemoryManagerGreedy::free_tensor(TensorInfo *tensor, - std::list &memory_list, - std::list &free_list) -{ - MemoryChunk *chunk = nullptr; - for (auto it = memory_list.begin(); it != memory_list.end(); ++it) { - chunk = *it; - if (chunk->tensor == tensor) { - chunk->free(); - - // merge with the next chunk - auto next_it = std::next(it, 1); - if (next_it != memory_list.end()) { - MemoryChunk *next_chunk = *next_it; - if (chunk->merge_free_chunk(next_chunk)) { - auto free_it = std::find(free_list.begin(), free_list.end(), next_chunk); - memory_list.erase(next_it); - free_list.erase(free_it); - delete next_chunk; - } - } - - // merge with the previous chunk - if (it != memory_list.begin()) { - auto prev_it = std::prev(it); - MemoryChunk *prev_chunk = *prev_it; - if (chunk->merge_free_chunk(prev_chunk)) { - auto free_it = std::find(free_list.begin(), free_list.end(), prev_chunk); - memory_list.erase(prev_it); - free_list.erase(free_it); - delete prev_chunk; - } - } - - // sort free list - free_list.push_back(chunk); - sort_memory_list(free_list); - return chunk; - } - } - return chunk; -} - -MemoryChunk *MemoryManagerGreedy::alloc_tensor(TensorInfo *tensor, int mode) -{ - // printf("alloc tensor:%s\n", tensor->name.c_str()); - MemoryChunk *chunk = nullptr; - for (auto it = free_list.begin(); it != free_list.end(); ++it) { - if ((*it)->size >= tensor->get_size()) { - // find a valid free memory chunk, split it and put the tensor into it - chunk = *it; - auto mem_it = std::find(memory_list.begin(), memory_list.end(), chunk); - MemoryChunk *split_chunk = chunk->insert(tensor); - free_list.erase(it); // remove this memory chunk in free list - if (split_chunk != nullptr) { - memory_list.insert(std::next(mem_it, 1), split_chunk); // add split memory chunk in memory list - free_list.push_back(split_chunk); // add split memory chunk in free list - sort_memory_list(free_list); // sort free list - } - break; - } - } - - if (chunk == nullptr) { - if (!memory_list.empty()) { - MemoryChunk *last_chunk = memory_list.back(); - if (last_chunk->is_free) { - auto last_it = std::find(free_list.begin(), free_list.end(), last_chunk); - free_list.erase(last_it); - sort_memory_list(free_list); - chunk = last_chunk->extend(tensor); - } - } - - if (chunk == nullptr) { - // add a new memory chunk - chunk = new MemoryChunk(tensor, this->alignment); - if (!memory_list.empty()) { - MemoryChunk *last = memory_list.back(); - chunk->offset = last->offset + last->size; - } - memory_list.push_back(chunk); - } - } - tensor->set_offset(chunk->offset); - return chunk; -} - -MemoryChunk *MemoryManagerGreedy::alloc_internal_tensor(TensorInfo *tensor, int mode) -{ - // printf("alloc tensor:%s\n", tensor->name.c_str()); - MemoryChunk *chunk = nullptr; - for (auto it = internal_free_list.begin(); it != internal_free_list.end(); ++it) { - if ((*it)->size >= tensor->get_size()) { - // find a valid free memory chunk, split it and put the tensor into it - chunk = *it; - auto mem_it = std::find(internal_memory_list.begin(), internal_memory_list.end(), chunk); - MemoryChunk *split_chunk = chunk->insert(tensor); - internal_free_list.erase(it); // remove this memory chunk in free list - if (split_chunk != nullptr) { - internal_memory_list.insert(std::next(mem_it, 1), split_chunk); // add split memory chunk in memory list - internal_free_list.push_back(split_chunk); // add split memory chunk in free list - sort_memory_list(internal_free_list); // sort free list - } - tensor->set_internal_offset(chunk->offset); - break; - } - } - - return chunk; -} - -void MemoryManagerGreedy::free_memory_list() -{ - if (!memory_list.empty()) { - for (auto it = memory_list.begin(); it != memory_list.end(); ++it) { - MemoryChunk *chunk = *it; - delete chunk; - } - memory_list.clear(); - free_list.clear(); - } - - if (!internal_memory_list.empty()) { - for (auto it = internal_memory_list.begin(); it != internal_memory_list.end(); ++it) { - MemoryChunk *chunk = *it; - delete chunk; - } - internal_memory_list.clear(); - internal_free_list.clear(); - } -} - -} // namespace memory - -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/src/dl_model_base.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/src/dl_model_base.cpp deleted file mode 100644 index b9d73af1..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/model/src/dl_model_base.cpp +++ /dev/null @@ -1,267 +0,0 @@ -#include - -#include "dl_memory_manager_greedy.hpp" -#include "dl_model_base.hpp" -#include "dl_module_creator.hpp" -#include "fbs_model.hpp" - -static const char *TAG = "dl::Model"; - -namespace dl { - -Model::Model(fbs::FbsModel *fbs_model, int internal_size, memory_manager_t mm_type) -{ - if (this->load(fbs_model) == ESP_OK) { - this->build(internal_size, mm_type); - } -} - -Model::~Model() -{ - if (fbs_loader) { - delete fbs_loader; - - // If fbs_loader is NULL, this means fbs_model is created by user, not by model. - if (fbs_model) { - delete fbs_model; - } - } - - if (memory_manager) { - delete memory_manager; - } - if (!execution_plan.empty()) { - for (int i = 0; i < execution_plan.size(); i++) { - delete execution_plan[i]; - } - } -} - -esp_err_t Model::load(const char *name, fbs::model_location_type_t location, int model_index, uint8_t *key) -{ - fbs_loader = new fbs::FbsLoader(name, location); - return this->load(fbs_loader->load(model_index, key)); -} - -esp_err_t Model::load(fbs::FbsModel *fbs_model) -{ - esp_err_t ret = ESP_OK; - if (!fbs_model) { - ESP_LOGE(TAG, "Fail to load model"); - ret = ESP_FAIL; - return ret; - } - this->fbs_model = fbs_model; // fbs_model is created by fbs_loader, so we don't need to delete it. - fbs_model->load_map(); - this->name = fbs_model->get_model_name(); - this->version = fbs_model->get_model_version(); - this->doc_string = fbs_model->get_model_doc_string(); - if (this->doc_string.empty()) { - ESP_LOGI(TAG, "model:%s, version:%lld\n", this->name.c_str(), this->version); - } else { - ESP_LOGI(TAG, - "model:%s, version:%lld, description:%s\n", - this->name.c_str(), - this->version, - this->doc_string.c_str()); - } - - // Construct the execution plan. - execution_plan.clear(); - dl::module::ModuleCreator *module_creator = dl::module::ModuleCreator::get_instance(); - - std::vector sorted_nodes = fbs_model->topological_sort(); - for (int i = 0; i < sorted_nodes.size(); i++) { - std::string node_name = sorted_nodes[i]; - std::string op_type = fbs_model->get_operation_type(node_name); - ESP_LOGI(TAG, "%s: %s", node_name.c_str(), op_type.c_str()); - if (op_type.empty()) { - ESP_LOGE(TAG, "Can not find the operation %s", node_name.c_str()); - ret = ESP_FAIL; - break; - } - dl::module::Module *module = module_creator->create(fbs_model, op_type, node_name); - if (!module) { - ESP_LOGE(TAG, "Do not support %s, please implement and register it first.", op_type.c_str()); - ret = ESP_FAIL; - break; - } - execution_plan.push_back(module); - } - - this->memory_manager = nullptr; - return ret; -} - -void Model::build(size_t internal_size, memory_manager_t mm_type, bool preload) -{ - int max_available_internal_size = heap_caps_get_largest_free_block(MALLOC_CAP_INTERNAL) * 0.8; - if (internal_size > max_available_internal_size) { - ESP_LOGW(TAG, "The maximum available internal memory is %d", max_available_internal_size); - internal_size = max_available_internal_size; - } - - // If memory manager has been created, delete it and reset all modules - this->fbs_model->load_map(); - if (this->memory_manager) { - delete this->memory_manager; - for (int i = 0; i < execution_plan.size(); i++) { - dl::module::Module *module = execution_plan[i]; - if (module) { - module->reset(); - } - } - } - - if (mm_type == MEMORY_MANAGER_GREEDY) { - this->memory_manager = new dl::memory::MemoryManagerGreedy(internal_size); - } - this->memory_manager->alloc(this->fbs_model, this->execution_plan); - - // get the TensorBase* of inputs and outputs - std::vector inputs_tmp = fbs_model->get_graph_inputs(); - std::vector outputs_tmp = fbs_model->get_graph_outputs(); - this->inputs.clear(); - this->outputs.clear(); - for (int i = 0; i < inputs_tmp.size(); i++) { - TensorBase *input_tensor = this->get_intermediate(inputs_tmp[i]); - this->inputs.emplace(inputs_tmp[i], input_tensor); - } - for (int i = 0; i < outputs_tmp.size(); i++) { - TensorBase *output_tensor = this->get_intermediate(outputs_tmp[i]); - this->outputs.emplace(outputs_tmp[i], output_tensor); - } - - this->fbs_model->clear_map(); -} - -void Model::run(runtime_mode_t mode) -{ - // execute each module. - for (int i = 0; i < execution_plan.size(); i++) { - dl::module::Module *module = execution_plan[i]; - if (module) { - module->forward(this->memory_manager->tensors, mode); - } else { - break; - } - } -} - -void Model::run(TensorBase *input, runtime_mode_t mode) -{ - if (this->inputs.size() != 1) { - ESP_LOGW(TAG, "The inputs of model is not jsut one! This API will assign data to first input"); - } - - TensorBase *model_input = this->inputs.begin()->second; - if (!model_input->assign(input)) { - ESP_LOGE(TAG, "Assign input failed"); - return; - } - - // execute each module. - for (int i = 0; i < execution_plan.size(); i++) { - dl::module::Module *module = execution_plan[i]; - if (module) { - // ESP_LOGI(TAG, "module: %d\n", i); - module->forward(this->memory_manager->tensors, mode); - } else { - break; - } - } -} - -void Model::run(std::map &user_inputs, - runtime_mode_t mode, - std::map user_outputs) -{ - if (user_inputs.size() != this->inputs.size()) { - ESP_LOGE(TAG, - "The size of user_inputs(%d) don't equal with the size of model inputs(%d).", - user_inputs.size(), - this->inputs.size()); - return; - } - - for (auto user_inputs_iter = user_inputs.begin(); user_inputs_iter != user_inputs.end(); user_inputs_iter++) { - std::string user_input_name = user_inputs_iter->first; - TensorBase *user_input_tensor = user_inputs_iter->second; - auto graph_input_iter = this->inputs.find(user_input_name); - if (graph_input_iter == this->inputs.end()) { - ESP_LOGE(TAG, "The input name(%s) isn't graph input.", user_input_name.c_str()); - return; - } - TensorBase *graph_input_tensor = graph_input_iter->second; - if (!graph_input_tensor->assign(user_input_tensor)) { - ESP_LOGE(TAG, "Assign input failed"); - return; - } - } - - // execute each module. - for (int i = 0; i < execution_plan.size(); i++) { - dl::module::Module *module = execution_plan[i]; - if (module) { - module->forward(this->memory_manager->tensors, mode); - // get the intermediate tensor for debug. - if (!user_outputs.empty()) { - for (auto user_outputs_iter = user_outputs.begin(); user_outputs_iter != user_outputs.end(); - user_outputs_iter++) { - int user_tensor_index = - this->memory_manager->get_tensor_index(const_cast(user_outputs_iter->first)); - if (user_tensor_index >= 0) { - std::vector outputs_index = module->get_outputs_index(); - for (int i = 0; i < outputs_index.size(); i++) { - if (user_tensor_index == outputs_index[i]) { - user_outputs_iter->second->assign(this->memory_manager->tensors[user_tensor_index]); - break; - } - } - } - } - } - } else { - break; - } - } - return; -} - -std::map &Model::get_inputs() -{ - return this->inputs; -} - -TensorBase *Model::get_intermediate(std::string name) -{ - if (name.empty()) { - ESP_LOGE(TAG, "Invalid name."); - return nullptr; - } - return this->memory_manager->get_tensor(name); -} - -std::map &Model::get_outputs() -{ - return this->outputs; -} - -void Model::print() -{ - if (!execution_plan.empty()) { - for (int i = 0; i < execution_plan.size(); i++) { - if (execution_plan[i]) { - ESP_LOGI(TAG, "------------------------------- %d -------------------------------\n", i); - if (execution_plan[i]) { - execution_plan[i]->print(); - } else { - break; - } - } - } - ESP_LOGI(TAG, "-------------------------------------------------------------\n"); - } -} - -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_add.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_add.hpp deleted file mode 100644 index 9dea9fd4..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_add.hpp +++ /dev/null @@ -1,105 +0,0 @@ -#pragma once - -#include "dl_base_add2d.hpp" -#include "dl_base_shape.hpp" -#include "dl_module_base.hpp" - -namespace dl { -namespace module { -/** - * NOTE: addition is element-wise, i.e., output[i,j,k] = input0[i,j,k] + input1[i,j,k] - * - * @tparam feature_t supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t quantize - * - int8_t: stands for operation in int8_t quantize - */ -class Add2D : public Module { -public: - /** - * @brief Construct a new Add2D object. - * - * @param name name of module - * @param inplace inplace type. - */ - Add2D(const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type) - { - } - - /** - * @brief Destroy the Add2D object. - */ - ~Add2D() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - assert(input_shapes.size() == 2); - - // support multidirectional broadcasting - std::vector output_shape = base::get_multidirectional_broadcasting_shape(input_shapes[0], input_shapes[1]); - - return std::vector>(1, output_shape); - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - // DL_LOG_LAYER_LATENCY_INIT(); - // DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } - // DL_LOG_LAYER_LATENCY_END(this->name, "Add2D"); - } - - void forward_args(void *args) - { - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - base::add2d(args); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - base::add2d(args); - } - } - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - TensorBase *input0 = tensors[m_inputs_index[0]]; - TensorBase *input1 = tensors[m_inputs_index[1]]; - TensorBase *output = tensors[m_outputs_index[0]]; - - std::vector> m_args = - base::get_arith_operation_args(output, input0, input1, Linear, nullptr, mode); - int task_size = m_args.size(); - if (task_size == 1) { // single task - forward_args((void *)&m_args[0]); - } else if (task_size == 2) { // multi task, use semaphore to maintain synchronization. - module_forward_dual_core(this, (void *)&m_args[0], (void *)&m_args[1]); - } else { - ESP_LOGE("Add2D", "Only support task size is 1 or 2, currently task size is %d", task_size); - } - } - - /** - * @brief deserialize Add module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - - // Create module - if (quant_type == QUANT_TYPE_SYMM_8BIT || quant_type == QUANT_TYPE_SYMM_16BIT) { - op = new Add2D(NULL, MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } - return op; - } - - void print() { ESP_LOGI("Add2D", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_avg_pool2d.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_avg_pool2d.hpp deleted file mode 100644 index 9ee7617a..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_avg_pool2d.hpp +++ /dev/null @@ -1,144 +0,0 @@ -#pragma once - -#include "dl_base_avg_pool2d.hpp" -#include "dl_module_base.hpp" - -namespace dl { -namespace module { -/** - * NOTE: - * - * @tparam feature_t supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t quantize - * - int8_t: stands for operation in int8_t quantize - */ -class AveragePool2D : public Module { -private: - std::vector filter_shape; /**/ - std::vector padding; /**/ - const int stride_y; /**/ - const int stride_x; /**/ -public: - /** - * @brief Construct a new AveragePool2D object. - * - * @param name name of module - * @param filter_shape filter shape in [height, width] - * @param padding padding size needed in [top, bottom, left, right] of this operation - * @param stride_y stride in height - * @param stride_x stride in width - */ - AveragePool2D(const char *name = NULL, - const std::vector &filter_shape = {2, 2}, - const std::vector &padding = {}, - const int stride_y = 1, - const int stride_x = 1, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, MODULE_NON_INPLACE, quant_type), - filter_shape(filter_shape), - padding(padding), - stride_y(stride_y), - stride_x(stride_x) - { - } - - /** - * @brief Destroy the AveragePool2D object. - */ - ~AveragePool2D() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - assert(input_shapes.size() == 1); - assert(input_shapes[0].size() == 4); - int *input_shape = input_shapes[0].data(); - std::vector output_shape(4); - - output_shape[0] = input_shape[0]; - output_shape[1] = (input_shape[1] + padding[0] + padding[1] - filter_shape[0]) / stride_y + 1; - output_shape[2] = (input_shape[2] + padding[2] + padding[3] - filter_shape[1]) / stride_x + 1; - output_shape[3] = input_shape[3]; - - std::vector> output_shapes(1, output_shape); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } - DL_LOG_LAYER_LATENCY_END(this->name, "AveragePool2D"); - } - - void forward_args(void *args) - { - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - base::avg_pool2d(args); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - base::avg_pool2d(args); - } - } - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - - std::vector> m_args = base::get_pool_args( - output, input, this->padding, this->filter_shape, this->stride_y, this->stride_x, mode); - int task_size = m_args.size(); - if (task_size == 1) { // single task - forward_args((void *)&m_args[0]); - } else if (task_size == 2) { // multi task, use semaphore to maintain synchronization. - module_forward_dual_core(this, (void *)&m_args[0], (void *)&m_args[1]); - } else { - ESP_LOGE("AveragePool2D", "Only support task size is 1 or 2, currently task size is %d", task_size); - } - } - - /** - * @brief deserialize AveragePool2D module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - std::vector kernel_shape; - std::vector pads; - std::vector strides; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "kernel_shape", kernel_shape); - fbs_model->get_operation_attribute(node_name, "pads", pads); - fbs_model->get_operation_attribute(node_name, "strides", strides); - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - - // Create module - if (quant_type == QUANT_TYPE_SYMM_8BIT || quant_type == QUANT_TYPE_SYMM_16BIT) { - op = new AveragePool2D(node_name.c_str(), - kernel_shape, - {pads[0], pads[2], pads[1], pads[3]}, - strides[0], - strides[1], - quant_type); - } - return op; - } - - void print() - { - ESP_LOGI("AveragePool2D", - "quant_type: %s, kernel size: %s, pads size: %s, strides size: [%d, %d]", - quant_type_to_string(quant_type), - shape_to_string(filter_shape).c_str(), - shape_to_string(padding).c_str(), - stride_y, - stride_x); - } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_base.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_base.hpp deleted file mode 100644 index 19fc917e..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_base.hpp +++ /dev/null @@ -1,195 +0,0 @@ -#pragma once -#include "dl_base.hpp" -#include "dl_define.hpp" -#include "dl_tensor_base.hpp" -#include "dl_tool.hpp" -#include "dl_tool_cache.hpp" -#include "fbs_model.hpp" -#include -#include - -namespace dl { -typedef enum { - MODULE_NON_INPLACE = 0, /**/ - MODULE_INPLACE_UNCHANGED_BUFFER = 1, /**/ - MODULE_INPLACE_CHANGED_BUFFER = 2 /**/ -} module_inplace_t; - -namespace module { -/** - * @brief Base class for module. - */ -class Module { -public: - char *name; /**/ - module_inplace_t inplace; - quant_type_t quant_type; - std::vector m_inputs_index; /**/ - std::vector m_outputs_index; /**/ - - /** - * @brief Construct a new Module object. - * - * @param name name of module. - */ - Module(const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE); - - /** - * @brief Destroy the Module object. Return resource. - * - */ - virtual ~Module(); - - /** - * @brief get the tensor index of this module's outputs - * - * @return tensor index of model's tensors - */ - virtual std::vector get_outputs_index() { return m_outputs_index; } - - /** - * @brief calculate output shape by input shape - * - * @param input_shapes input shapes - * - * @return outputs shapes - */ - virtual std::vector> get_output_shape(std::vector> &input_shapes) = 0; - - /** - * @brief Run the module, high-level inferface for model layer - * - * @param tensors All inputs and outputs from MemoryManager - * @param assign_core not effective yet - * - */ - virtual void forward(std::vector &tensors, runtime_mode_t mode = RUNTIME_MODE_AUTO) = 0; - - /** - * @brief Run the module, Low-level interface for base layer and multi-core processing - * - * @param args ArgsType, arithArgsType, resizeArgsType and so on - */ - virtual void forward_args(void *args) = 0; - - /** - * @brief create module instance by node serialization information - * - * @param fbs_model Flatbuffer's model - * @param node_name The node name in model's graph - * - * @return The pointer of module instance - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) { return nullptr; } - - /** - * @brief print module information - */ - virtual void print() {} - - /** - * @brief set preload RAM pointer - * - * @param addr Internal RAM address, should be aligned to 16 bytes - * @param size The size of RAM address - * - */ - virtual void set_preload_addr(void *addr, size_t size) {} - - /** - * @brief perform a preload operation - */ - virtual void preload() {} - - /** - * @brief reset all state of module, include inputs, outputs and preload cache setting - */ - virtual void reset() - { - this->m_inputs_index.clear(); - this->m_outputs_index.clear(); - } -}; - -/** - * @brief The data struct of module task. Pack all necessary information as the input for module task. - */ -typedef struct { - Module *op; - void *args; - SemaphoreHandle_t &semaphore; // recommend xSemaphoreCreateCounting -} module_task_data_t; - -/** - * @brief The function of module task. - * @param args The data of module task. - */ -static void module_forward_task(void *args) -{ - module_task_data_t *task = (module_task_data_t *)args; - task->op->forward_args(task->args); - xSemaphoreGive(task->semaphore); - vTaskDelete(NULL); -} -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-function" -/** - * @brief Run the module with dual core and use semaphores to keep tasks in sync - * - * @param op Module instance - * @param args1 Task1 args: ArgsType, arithArgsType, resizeArgsType and so on - * @param args2 Task2 args: ArgsType, arithArgsType, resizeArgsType and so on - */ -static void module_forward_dual_core(Module *op, void *args1, void *args2) -{ - BaseType_t current_core_id = xPortGetCoreID(); - UBaseType_t current_priority = uxTaskPriorityGet(xTaskGetCurrentTaskHandle()); - SemaphoreHandle_t semaphore = xSemaphoreCreateCounting(2, 0); - module_task_data_t task_data1 = { - .op = op, - .args = args1, - .semaphore = semaphore, - }; - xTaskCreatePinnedToCore( - module_forward_task, NULL, 2048, &task_data1, current_priority, NULL, (current_core_id + 1) % 2); - - module_task_data_t task_data2 = { - .op = op, - .args = args2, - .semaphore = semaphore, - }; - xTaskCreatePinnedToCore(module_forward_task, NULL, 2048, &task_data2, current_priority, NULL, current_core_id); - - xSemaphoreTake(semaphore, portMAX_DELAY); - xSemaphoreTake(semaphore, portMAX_DELAY); - vSemaphoreDelete(semaphore); -} -#pragma GCC diagnostic pop - -#if DL_LOG_LAYER_LATENCY -/** - * @brief Initialize. - */ -#define DL_LOG_LAYER_LATENCY_INIT() dl::tool::Latency latency - -/** - * @brief Time starts. - */ -#define DL_LOG_LAYER_LATENCY_START() latency.start() - -/** - * @brief Time ends and printed. - */ -#define DL_LOG_LAYER_LATENCY_END(prefix, key) \ - latency.end(); \ - latency.print(prefix, key) -#else -#define DL_LOG_LAYER_LATENCY_INIT() -#define DL_LOG_LAYER_LATENCY_START() -#define DL_LOG_LAYER_LATENCY_END(prefix, key) -#endif - -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_clip.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_clip.hpp deleted file mode 100644 index c2fc52de..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_clip.hpp +++ /dev/null @@ -1,100 +0,0 @@ -#pragma once - -#include "dl_module_base.hpp" - -namespace dl { -namespace module { - -class Clip : public Module { -private: - TensorBase *m_min; /**/ - TensorBase *m_max; /**/ - -public: - /** - * @brief Construct a new Clip object. - * - * @param name name of module - * @param inplace inplace type. - */ - Clip(TensorBase *min, - TensorBase *max, - const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type), m_min(min), m_max(max) - { - } - - /** - * @brief Destroy the Clip object. - */ - ~Clip() - { - if (m_min) { - delete m_min; - } - - if (m_max) { - delete m_max; - } - } - - std::vector> get_output_shape(std::vector> &input_shapes) - { - std::vector> output_shapes(1, input_shapes[0]); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_FLOAT32) { - forward_template(tensors, mode); - } - DL_LOG_LAYER_LATENCY_END(this->name, "Clip"); - } - - void forward_args(void *args) {} - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - assert(input->get_size() == output->get_size()); - T min_value = *static_cast(m_min->get_element_ptr()); - T max_value = *static_cast(m_max->get_element_ptr()); - - T *src_data = static_cast(input->get_element_ptr()); - T *data = static_cast(output->get_element_ptr()); - for (int i = 0; i < input->get_size(); i++) { - data[i] = DL_CLIP(src_data[i], min_value, max_value); - } - } - - /** - * @brief deserialize Clip module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - TensorBase *min = fbs_model->get_operation_parameter(node_name, 1); - TensorBase *max = fbs_model->get_operation_parameter(node_name, 2); - - // Create module - op = new Clip(min, max, node_name.c_str(), MODULE_INPLACE_CHANGED_BUFFER, quant_type); - return op; - } - - void print() { ESP_LOGI("Clip", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_concat.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_concat.hpp deleted file mode 100644 index 1f7d7cef..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_concat.hpp +++ /dev/null @@ -1,163 +0,0 @@ -#pragma once - -#include "dl_module_base.hpp" - -namespace dl { -namespace module { -/** - * NOTE: - * - * @tparam feature_t supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t quantize - * - int8_t: stands for operation in int8_t quantize - */ -class Concat : public Module { -private: - int axis; /**/ - int n_dims; /**/ - int n_inputs; /**/ - int loop_times; - std::vector copy_nums; - -public: - /** - * @brief Construct a new Concat object. - * - * @param name name of module - */ - Concat(const char *name = NULL, int axis = 0, quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, MODULE_NON_INPLACE, quant_type), axis(axis) - { - } - - /** - * @brief Destroy the Concat object. - */ - ~Concat() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - this->n_inputs = input_shapes.size(); - assert(this->n_inputs > 1); - this->n_dims = input_shapes[0].size(); - - if (this->axis < 0) - this->axis += this->n_dims; - assert(this->axis >= 0 && this->axis < this->n_dims); - - int output_axis_dim = 0; - this->loop_times = 1; - this->copy_nums.assign(this->n_inputs, 1); - for (size_t i = 0; i < this->n_inputs; i++) { - assert(input_shapes[i].size() == this->n_dims); - for (size_t j = 0; j < this->n_dims; j++) { - if (i == 0 && j < this->axis) { - this->loop_times *= input_shapes[0][j]; - } - if (i > 0 && j != this->axis) { - assert(input_shapes[i][j] == input_shapes[i - 1][j]); - } - if (j >= this->axis) { - this->copy_nums[i] *= input_shapes[i][j]; - } - } - output_axis_dim += input_shapes[i][this->axis]; - } - - std::vector output_shape(input_shapes[0]); - output_shape[this->axis] = output_axis_dim; - std::vector> output_shapes(1, output_shape); - - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } - DL_LOG_LAYER_LATENCY_END(this->name, "Concat"); - } - - void forward_args(void *args) {} - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - TensorBase *output = tensors[m_outputs_index[0]]; - T *output_ptr = (T *)output->get_element_ptr(); - - std::vector inputs_ptr(this->n_inputs); - for (size_t i = 0; i < this->n_inputs; i++) { - TensorBase *input = tensors[m_inputs_index[i]]; - inputs_ptr[i] = (T *)input->get_element_ptr(); - } - - for (size_t i = 0; i < this->loop_times; i++) { - for (size_t j = 0; j < this->n_inputs; j++) { - tool::copy_memory(output_ptr, inputs_ptr[j], sizeof(T) * this->copy_nums[j]); - output_ptr += copy_nums[j]; - inputs_ptr[j] += copy_nums[j]; - } - } - } - - /** - * @brief deserialize Concat module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - int axis; - std::vector output_shape; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "axis", axis); - fbs_model->get_operation_output_shape(node_name, 0, output_shape); - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - - // if (output_shape.size() == 4) - // { - // assert (axis > 0 && axis < 4); - // // n c h w => h w c - // switch (axis){ - // case 1: axis = 2; - // break; - // case 2: axis = 0; - // break; - // case 3: axis = 1; - // break; - // } - // } - // else if (output_shape.size() == 3) - // { - // assert (axis > 0 && axis < 3); - // // n c w => w c - // switch (axis){ - // case 1: axis = 1; - // break; - // case 2: axis = 0; - // break; - // } - // } - // else if (output_shape.size() == 2) - // { - // // n c => c - // assert (axis == 1); - // axis = 0; - // } - - // Create module - if (quant_type == QUANT_TYPE_SYMM_8BIT || quant_type == QUANT_TYPE_SYMM_16BIT) { - op = new Concat(node_name.c_str(), axis, quant_type); - } - return op; - } - - void print() { ESP_LOGI("Concat", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_conv2d.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_conv2d.hpp deleted file mode 100644 index 12f70cd5..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_conv2d.hpp +++ /dev/null @@ -1,263 +0,0 @@ -#pragma once - -#include "dl_base_conv2d.hpp" -#include "dl_base_depthwise_conv2d.hpp" -#include "dl_module_base.hpp" -#include -#include "freertos/FreeRTOS.h" -#include "freertos/task.h" - -namespace dl { -namespace module { - -/** - * @brief Activation(Conv2D(input, filter) + bias). - * - * @tparam feature_t supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t quantize - * - int8_t: stands for operation in int8_t quantize - */ -class Conv2D : public Module { -private: - TensorBase *filter; /**/ - TensorBase *bias; /**/ - const int stride_y; /**/ - const int stride_x; /**/ - const int dilation_y; /**/ - const int dilation_x; /**/ - const int group; - activation_type_t activation; /**/ - std::vector padding; /**/ - -public: - /** - * @brief Construct a new Conv2D object. - * - * @param filter filter of Conv2D - * @param bias bias of Conv2D, if you don't specify anything, no bias is added - * @param activation activation of Conv2D, if you don't specify anything, no activation is applied - * @param padding the shape must be 4, the value of each position is: [padding top, padding bottom, padding - * left, padding right] - * @param stride_y stride in height - * @param stride_x stride in width - * @param group group of Conv - * @param name name of module - */ - Conv2D(TensorBase *filter, - TensorBase *bias = NULL, - activation_type_t activation = Linear, - std::vector padding = {}, - const int stride_y = 1, - const int stride_x = 1, - const int dilation_y = 1, - const int dilation_x = 1, - const char *name = NULL, - const int group = 1, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, MODULE_NON_INPLACE, quant_type), - filter(filter), - bias(bias), - stride_y(stride_y), - stride_x(stride_x), - dilation_y(dilation_y), - dilation_x(dilation_x), - group(group), - activation(activation), - padding(padding) - { - } - - /** - * @brief Destroy the Conv2D object. - * - */ - ~Conv2D() - { - if (filter) { - delete filter; - } - - if (bias) { - delete bias; - } - } - - /** - * @brief Calculate the output shape - * - * @param input_shape The shape of inputs - * - * @return output shape - */ - std::vector> get_output_shape(std::vector> &input_shapes) - { - assert(input_shapes.size() == 1); - assert(input_shapes[0].size() == 4); - int *input_shape = input_shapes[0].data(); - int *filter_shape = filter->shape.data(); - std::vector output_shape(4); - - // refer to https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html - output_shape[0] = input_shape[0]; - output_shape[1] = - (input_shape[1] + padding[0] + padding[1] - dilation_y * (filter_shape[0] - 1) - 1) / stride_y + 1; - output_shape[2] = - (input_shape[2] + padding[2] + padding[3] - dilation_x * (filter_shape[1] - 1) - 1) / stride_x + 1; - output_shape[3] = group == 1 ? filter_shape[3] : input_shape[3]; - - std::vector> output_shapes(1, output_shape); - return output_shapes; - } - - void forward_args(void *args) - { - if (group == 1) { - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - base::conv2d(args); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - base::conv2d(args); - } - } else { - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - base::depthwise_conv2d(args); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - // base::depthwise_conv2d(args); - } - } - } - - void forward(std::vector &tensors, runtime_mode_t mode = RUNTIME_MODE_AUTO) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } - DL_LOG_LAYER_LATENCY_END(this->name, "Conv2d"); - } - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - - std::vector> m_args = - base::get_conv_operation_args(output, - input, - this->padding, - this->filter, - this->stride_y, - this->stride_x, - this->dilation_y, - this->dilation_x, - this->group, - this->bias, - this->activation, - nullptr, - mode); // do not support RReLU and Leaky RelU - int task_size = m_args.size(); - if (task_size == 1) { // single task - forward_args((void *)&m_args[0]); - } else if (task_size == 2) { // multi task, use semaphore to maintain synchronization. - module_forward_dual_core(this, (void *)&m_args[0], (void *)&m_args[1]); - } else { - ESP_LOGE("Conv2D", "Only support task size is 1 or 2, currently task size is %d", task_size); - } - } - - /** - * @brief deserialize Conv2d module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *conv2d_op = nullptr; - - std::vector pads; - std::vector strides; - std::vector dilations; - int group = 1; - activation_type_t activation_type; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "pads", pads); - fbs_model->get_operation_attribute(node_name, "strides", strides); - fbs_model->get_operation_attribute(node_name, "dilations", dilations); - fbs_model->get_operation_attribute(node_name, "group", group); - fbs_model->get_operation_attribute(node_name, "activation", activation_type); - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - - // Create module - if (quant_type == QUANT_TYPE_SYMM_8BIT || quant_type == QUANT_TYPE_SYMM_16BIT) { - TensorBase *filter = fbs_model->get_operation_parameter(node_name, 1); - TensorBase *bias = fbs_model->get_operation_parameter(node_name, 2); - if (bias) { - bias->reset_bias_layout(quant_type, group != 1); - } - - conv2d_op = new Conv2D(filter, - bias, - activation_type, - {pads[0], pads[2], pads[1], pads[3]}, - strides[0], - strides[1], - dilations[0], - dilations[1], - node_name.c_str(), - group, - quant_type); - } - - return conv2d_op; - } - - void print() - { - ESP_LOGI("Conv2d", - "filter:%s, bias:%s, pads: %s, strides: [%d,%d], dilations: [%d,%d], group: %d, activation: %s, " - "quant_type: %s.", - shape_to_string(filter->shape).c_str(), - bias == nullptr ? "false" : "true", - shape_to_string(padding).c_str(), - stride_y, - stride_x, - dilation_y, - dilation_x, - group, - activation_type_to_string(activation), - quant_type_to_string(quant_type)); - } - - // void set_preload_addr(void *addr, size_t size) - // { - // size_t offset = 0; - // if (this->filter) { - // offset = this->filter->set_preload_addr(addr, size); - // } - // if (this->bias) { - // this->bias->set_preload_addr((void *)((char *)addr + offset), size - offset); - // } - // } - - // void preload() - // { - // // printf("preload filter and bias!"); - // if (filter) - // filter->preload(); - // if (bias) - // bias->preload(); - // } - - // void reset() - // { - // this->m_inputs_index.clear(); - // this->m_outputs_index.clear(); - // this->filter->cache = nullptr; - // if (this->bias != nullptr) { - // this->bias->cache = nullptr; - // } - // } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_creator.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_creator.hpp deleted file mode 100644 index 8fe9fdd4..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_creator.hpp +++ /dev/null @@ -1,113 +0,0 @@ -#pragma once -#include "dl_module_add.hpp" -#include "dl_module_avg_pool2d.hpp" -#include "dl_module_clip.hpp" -#include "dl_module_concat.hpp" -#include "dl_module_conv2d.hpp" -#include "dl_module_exp.hpp" -#include "dl_module_flatten.hpp" -#include "dl_module_gemm.hpp" -#include "dl_module_global_avg_pool2d.hpp" -#include "dl_module_hardsigmoid.hpp" -#include "dl_module_hardswish.hpp" -#include "dl_module_leakyrelu.hpp" -#include "dl_module_log.hpp" -#include "dl_module_lut.hpp" -#include "dl_module_mul.hpp" -#include "dl_module_prelu.hpp" -#include "dl_module_relu.hpp" -#include "dl_module_requantize_linear.hpp" -#include "dl_module_reshape.hpp" -#include "dl_module_resize2d.hpp" -#include "dl_module_sigmoid.hpp" -#include "dl_module_sqrt.hpp" -#include "dl_module_squeeze.hpp" -#include "dl_module_tanh.hpp" -#include "dl_module_transpose.hpp" -#include "dl_module_unsqueeze.hpp" -#include "fbs_loader.hpp" -#include -#include -#include -namespace dl { -namespace module { -class ModuleCreator { -public: - using Creator = std::function; - - static ModuleCreator *get_instance() - { - // This is thread safe for C++11, please refer to `Meyers' implementation of the Singleton pattern` - static ModuleCreator instance; - return &instance; - } - - void register_module(const std::string &op_type, Creator creator) { ModuleCreator::creators[op_type] = creator; } - - Module *create(fbs::FbsModel *fbs_model, const std::string &op_type, const std::string name) - { - this->register_dl_modules(); - - if (creators.find(op_type) != creators.end()) { - return creators[op_type](fbs_model, name); - } - return nullptr; - } - - void register_dl_modules() - { - if (creators.empty()) { - this->register_module("Conv", Conv2D::deserialize); - this->register_module("Mul", Mul2D::deserialize); - this->register_module("Add", Add2D::deserialize); - this->register_module("Resize", Resize2D::deserialize); - this->register_module("GlobalAveragePool", GlobalAveragePool2D::deserialize); - this->register_module("AveragePool", AveragePool2D::deserialize); - this->register_module("Concat", Concat::deserialize); - this->register_module("Sigmoid", Sigmoid::deserialize); - this->register_module("Tanh", Tanh::deserialize); - this->register_module("Relu", Relu::deserialize); - this->register_module("LeakyRelu", LeakyRelu::deserialize); - this->register_module("HardSigmoid", HardSigmoid::deserialize); - this->register_module("HardSwish", HardSwish::deserialize); - this->register_module("Gelu", LUT::deserialize); - this->register_module("Elu", LUT::deserialize); - this->register_module("LUT", LUT::deserialize); - this->register_module("Gemm", Gemm::deserialize); - this->register_module("QuantizeLinear", RequantizeLinear::deserialize); - this->register_module("DequantizeLinear", RequantizeLinear::deserialize); - this->register_module("RequantizeLinear", RequantizeLinear::deserialize); - this->register_module("PRelu", PRelu::deserialize); - this->register_module("Clip", Clip::deserialize); - this->register_module("Flatten", Flatten::deserialize); - this->register_module("Reshape", Reshape::deserialize); - this->register_module("Transpose", Transpose::deserialize); - this->register_module("Exp", Exp::deserialize); - this->register_module("Log", Log::deserialize); - this->register_module("Sqrt", Sqrt::deserialize); - this->register_module("Squeeze", Squeeze::deserialize); - this->register_module("Unsqueeze", Unsqueeze::deserialize); - } - } - - void print() - { - if (!creators.empty()) { - for (auto it = creators.begin(); it != creators.end(); ++it) { - printf("%s", (*it).first.c_str()); - } - } else { - printf("Create empty module\n"); - } - } - -private: - ModuleCreator() {} - ~ModuleCreator() {} - ModuleCreator(const ModuleCreator &) = delete; - ModuleCreator &operator=(const ModuleCreator &) = delete; - std::map creators; -}; - -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_exp.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_exp.hpp deleted file mode 100644 index f980c23a..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_exp.hpp +++ /dev/null @@ -1,104 +0,0 @@ -#pragma once - -#include "dl_math.hpp" -#include "dl_module_base.hpp" -#include "dl_module_lut.hpp" - -namespace dl { -namespace module { - -/** - * @brief: Calculates the exponential of the given input tensor, element-wise. - * Supports float, int16_t and int8_t - */ -class Exp : public Module { -public: - /** - * @brief Construct a new Exp object. - * - * @param name name of module - * @param inplace inplace type. - */ - Exp(const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type) - { - } - - /** - * @brief Destroy the Exp object. - */ - ~Exp() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - std::vector> output_shapes(1, input_shapes[0]); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode = RUNTIME_MODE_AUTO) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_FLOAT32) { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - float *input_ptr = (float *)input->get_element_ptr(); - float *output_ptr = (float *)output->get_element_ptr(); - - for (size_t i = 0; i < input->size; i++) { - output_ptr[i] = expf(input_ptr[i]); - } - } - DL_LOG_LAYER_LATENCY_END(this->name, "Exp"); - } - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - T *input_ptr = (T *)input->get_element_ptr(); - T *output_ptr = (T *)output->get_element_ptr(); - - float input_scale = DL_SCALE(input->exponent); - float output_scale = DL_RESCALE(output->exponent); - for (size_t i = 0; i < input->size; i++) { - float temp = input_ptr[i] * input_scale; - temp = expf(temp); - tool::truncate(output_ptr[i], tool::round(temp * output_scale)); - } - } - - void forward_args(void *args) {} - - /** - * @brief deserialize Exp module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - TensorBase *table = fbs_model->get_operation_lut(node_name); - - // Create module - if (table != NULL) { - op = new LUT(node_name.c_str(), table, MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } else { - op = new Exp(node_name.c_str(), MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } - op->print(); - - return op; - } - - void print() { ESP_LOGI("Exp", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_flatten.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_flatten.hpp deleted file mode 100644 index 6c37f8db..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_flatten.hpp +++ /dev/null @@ -1,94 +0,0 @@ -#pragma once - -#include "dl_module_base.hpp" - -namespace dl { -namespace module { - -// https://onnx.ai/onnx/operators/onnx__Flatten.html -class Flatten : public Module { -private: - int m_axis; /**/ - -public: - /** - * @brief Construct a new Flatten object. - * - * @param name name of module - * @param inplace true: the output will store to input0 - * false: the output will store to a separate memory - */ - Flatten(int axis, - const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type), m_axis(axis) - { - } - - /** - * @brief Destroy the Flatten object. - */ - ~Flatten() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - assert(input_shapes.size() == 1); - std::vector output_shape(2); - int index = -1; - if (m_axis < 0) { - index = input_shapes[0].size() + m_axis; - } else { - index = m_axis; - } - int size = 1; - for (int i = 0; i < index; i++) { - size *= input_shapes[0][i]; - } - output_shape[0] = size; - - size = 1; - for (int i = index; i < input_shapes[0].size(); i++) { - size *= input_shapes[0][i]; - } - output_shape[1] = size; - std::vector> output_shapes(1, output_shape); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - assert(input->get_size() == output->get_size()); - if (output->get_element_ptr() != input->get_element_ptr()) { - output->assign(input); - } - DL_LOG_LAYER_LATENCY_END(this->name, "Flatten"); - } - - void forward_args(void *args) {} - - /** - * @brief deserialize Flatten module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - int axis = 1; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - fbs_model->get_operation_attribute(node_name, "axis", axis); - - // Create module - op = new Flatten(axis, node_name.c_str(), MODULE_INPLACE_UNCHANGED_BUFFER, quant_type); - return op; - } - - void print() { ESP_LOGI("Flatten", "quant_type: %s, axis: %d.", quant_type_to_string(quant_type), m_axis); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_gemm.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_gemm.hpp deleted file mode 100644 index 16faa3ae..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_gemm.hpp +++ /dev/null @@ -1,180 +0,0 @@ -#pragma once - -#include "dl_base_conv2d.hpp" -#include "dl_base_depthwise_conv2d.hpp" -#include "dl_module_base.hpp" -#include -#include "freertos/FreeRTOS.h" -#include "freertos/task.h" - -namespace dl { -namespace module { - -/** - * @brief Activation(Gemm(input, filter) + bias). - * - */ -class Gemm : public Module { -private: - TensorBase *filter; /**/ - TensorBase *bias; /**/ - activation_type_t activation; /**/ - -public: - /** - * @brief Construct a new Gemm object. - * - * @param filter filter of Gemm. It's shape is [1, 1, in_features, out_features] - * @param bias bias of Gemm, if you don't specify anything, no bias is added - * @param activation activation of Gemm, if you don't specify anything, no activation is applied - * @param name name of module - */ - Gemm(TensorBase *filter, - TensorBase *bias = nullptr, - activation_type_t activation = Linear, - const char *name = nullptr, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, MODULE_NON_INPLACE, quant_type), filter(filter), bias(bias), activation(activation) - { - } - - /** - * @brief Destroy the Gemm object. - * - */ - ~Gemm() - { - if (filter) { - delete filter; - } - - if (bias) { - delete bias; - } - } - - /** - * @brief Calculate the output shape - * - * @param input_shape The shape of inputs - * - * @return output shape - */ - std::vector> get_output_shape(std::vector> &input_shapes) - { - assert(input_shapes.size() == 1); - assert(filter->shape.size() == 4); - assert(filter->shape[0] == 1); - assert(filter->shape[1] == 1); - assert(input_shapes[0][input_shapes[0].size() - 1] == filter->shape[2]); - - // refer to https://pytorch.org/docs/stable/generated/torch.nn.Linear.html - std::vector output_shape = input_shapes[0]; - output_shape[output_shape.size() - 1] = filter->shape[3]; - std::vector> output_shapes(1, output_shape); - return output_shapes; - } - - void forward_args(void *args) - { - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - base::conv2d(args); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - base::conv2d(args); - } - } - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - std::vector padding(4, 0); - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - std::vector origin_input_shape = input->get_shape(); - std::vector origin_output_shape = output->get_shape(); - input->set_shape({1, 1, input->get_size() / origin_input_shape.back(), origin_input_shape.back()}); - output->set_shape({1, 1, output->get_size() / origin_output_shape.back(), origin_output_shape.back()}); - - std::vector> m_args = - base::get_conv_operation_args(output, - input, - padding, - this->filter, - 1 /*stride_y*/, - 1 /*stride_x*/, - 1 /*dilation_y*/, - 1 /*dilation_x*/, - 1 /*group*/, - this->bias, - this->activation, - nullptr, - mode); // do not support PReLU and Leaky RelU - int task_size = m_args.size(); - if (task_size == 1) { // single task - forward_args((void *)&m_args[0]); - } else if (task_size == 2) { // multi task, use semaphore to maintain synchronization. - ESP_LOGI("Gemm", "two task..."); - module_forward_dual_core(this, (void *)&m_args[0], (void *)&m_args[1]); - } else { - ESP_LOGE("Gemm", "Only support task size is 1 or 2, currently task size is %d", task_size); - } - input->set_shape(origin_input_shape); - output->set_shape(origin_output_shape); - } - - void forward(std::vector &tensors, runtime_mode_t mode = RUNTIME_MODE_AUTO) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } - DL_LOG_LAYER_LATENCY_END(this->name, "Gemm"); - } - - /** - * @brief deserialize Conv2d module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *gemm_op = nullptr; - - activation_type_t activation_type; - quant_type_t quant_type; - int transA = -1, transB = -1; - fbs_model->get_operation_attribute(node_name, "activation", activation_type); - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - fbs_model->get_operation_attribute(node_name, "transA", transA); - fbs_model->get_operation_attribute(node_name, "transB", transB); - assert(transA == -1 || transA == 0); - assert(transB == -1 || transB == 0); - - // Create module - if (quant_type == QUANT_TYPE_SYMM_8BIT || quant_type == QUANT_TYPE_SYMM_16BIT) { - TensorBase *filter = fbs_model->get_operation_parameter(node_name, 1); - TensorBase *bias = fbs_model->get_operation_parameter(node_name, 2); - if (bias) { - bias->reset_bias_layout(quant_type, false); - } - - gemm_op = new Gemm(filter, bias, activation_type, node_name.c_str(), quant_type); - } - - return gemm_op; - } - - void print() - { - ESP_LOGI("Gemm", - "filter:%s, bias:%s, activation: %s, " - "quant_type: %s.", - shape_to_string(filter->shape).c_str(), - bias == nullptr ? "false" : "true", - activation_type_to_string(activation), - quant_type_to_string(quant_type)); - } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_global_avg_pool2d.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_global_avg_pool2d.hpp deleted file mode 100644 index d8d16720..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_global_avg_pool2d.hpp +++ /dev/null @@ -1,103 +0,0 @@ -#pragma once - -#include "dl_base_avg_pool2d.hpp" -#include "dl_module_base.hpp" - -namespace dl { -namespace module { -/** - * NOTE: - * - * @tparam feature_t supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t quantize - * - int8_t: stands for operation in int8_t quantize - */ -class GlobalAveragePool2D : public Module { -public: - /** - * @brief Construct a new GlobalAveragePool2D object. - * - * @param name name of module - */ - GlobalAveragePool2D(const char *name = NULL, quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, MODULE_NON_INPLACE, quant_type) - { - } - - /** - * @brief Destroy the GlobalAveragePool2D object. - */ - GlobalAveragePool2D() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - assert(input_shapes.size() == 1); - assert(input_shapes[0].size() == 4); - int *input_shape = input_shapes[0].data(); - - std::vector output_shape(4, 1); - output_shape[3] = input_shape[3]; - - std::vector> output_shapes(1, output_shape); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } - DL_LOG_LAYER_LATENCY_END(this->name, "GlobalAveragePool2D"); - } - - void forward_args(void *args) - { - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - base::avg_pool2d(args); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - base::avg_pool2d(args); - } - } - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - - std::vector> m_args = - base::get_pool_args(output, input, {0, 0, 0, 0}, {input->shape[1], input->shape[2]}, 1, 1, mode); - int task_size = m_args.size(); - if (task_size == 1) { // single task - forward_args((void *)&m_args[0]); - } else if (task_size == 2) { // multi task, use semaphore to maintain synchronization. - module_forward_dual_core(this, (void *)&m_args[0], (void *)&m_args[1]); - } else { - ESP_LOGE("GlobalAveragePool2D", "Only support task size is 1 or 2, currently task size is %d", task_size); - } - } - - /** - * @brief deserialize GlobalAveragePool2D module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - - // Create module - if (quant_type == QUANT_TYPE_SYMM_8BIT || quant_type == QUANT_TYPE_SYMM_16BIT) { - op = new GlobalAveragePool2D(node_name.c_str(), quant_type); - } - return op; - } - - void print() { ESP_LOGI("GlobalAveragePool2D", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_hardsigmoid.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_hardsigmoid.hpp deleted file mode 100644 index dcc2b469..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_hardsigmoid.hpp +++ /dev/null @@ -1,111 +0,0 @@ -#pragma once - -#include "dl_module_base.hpp" -#include "dl_module_lut.hpp" - -namespace dl { -namespace module { -/** - * NOTE: - * - * @tparam feature_t supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t, implemented by LUT - * - int8_t: stands for operation in int16_t, implemented by LUT - * y = max(0, min(1, alpha * x + beta)), refer to https://onnx.ai/onnx/operators/onnx__HardSigmoid.html - */ -class HardSigmoid : public Module { -private: - float alpha; - float beta; - -public: - /** - * @brief Construct a new HardSigmoid object. - * - * @param name name of module - * @param inplace inplace type. - */ - HardSigmoid(const char *name = NULL, - float alpha = 0.2, - float beta = 0.5, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type) - { - this->alpha = alpha; - this->beta = beta; - } - - /** - * @brief Destroy the HardSigmoid object. - */ - ~HardSigmoid() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - std::vector> output_shapes(1, input_shapes[0]); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode = RUNTIME_MODE_AUTO) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } - DL_LOG_LAYER_LATENCY_END(this->name, "HardSigmoid"); - } - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - T *input_ptr = (T *)input->get_element_ptr(); - T *output_ptr = (T *)output->get_element_ptr(); - - float input_scale = DL_SCALE(input->exponent); - float output_scale = DL_RESCALE(output->exponent); - for (size_t i = 0; i < input->size; i++) { - float temp = input_ptr[i] * input_scale; - temp = DL_MAX(0, DL_MIN(1, this->alpha * temp + this->beta)); - tool::truncate(output_ptr[i], tool::round(temp * output_scale)); - } - } - - void forward_args(void *args) {} - - /** - * @brief deserialize HardSigmoid module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - float alpha = 0.2; - float beta = 0.5; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - fbs_model->get_operation_attribute(node_name, "alpha", alpha); - fbs_model->get_operation_attribute(node_name, "beta", beta); - TensorBase *table = fbs_model->get_operation_lut(node_name); - - // Create module - if (table != NULL) { - op = new LUT(node_name.c_str(), table, MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } else { - op = new HardSigmoid(node_name.c_str(), alpha, beta, MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } - - return op; - } - - void print() - { - ESP_LOGI("HardSigmoid", "quant_type: %s. alpha:%f, beta:%f", quant_type_to_string(quant_type), alpha, beta); - } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_hardswish.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_hardswish.hpp deleted file mode 100644 index 0117cbe7..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_hardswish.hpp +++ /dev/null @@ -1,97 +0,0 @@ -#pragma once - -#include "dl_module_base.hpp" -#include "dl_module_lut.hpp" - -namespace dl { -namespace module { -/** - * NOTE: - * - * @tparam feature_t supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t, implemented by LUT - * - int8_t: stands for operation in int16_t, implemented by LUT - * y = x * max(0, min(1, 0.166667 * x + 0.5)), refer to https://onnx.ai/onnx/operators/onnx__HardSwish.html - */ -class HardSwish : public Module { -public: - /** - * @brief Construct a new HardSwish object. - * - * @param name name of module - * @param inplace inplace type. - */ - HardSwish(const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type) - { - } - - /** - * @brief Destroy the HardSwish object. - */ - ~HardSwish() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - std::vector> output_shapes(1, input_shapes[0]); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode = RUNTIME_MODE_AUTO) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } - DL_LOG_LAYER_LATENCY_END(this->name, "HardSwish"); - } - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - T *input_ptr = (T *)input->get_element_ptr(); - T *output_ptr = (T *)output->get_element_ptr(); - - float input_scale = DL_SCALE(input->exponent); - float output_scale = DL_RESCALE(output->exponent); - for (size_t i = 0; i < input->size; i++) { - float temp = input_ptr[i] * input_scale; - temp = DL_MAX(0, DL_MIN(1, 0.166667 * temp + 0.5)) * temp; - tool::truncate(output_ptr[i], tool::round(temp * output_scale)); - } - } - - void forward_args(void *args) {} - - /** - * @brief deserialize HardSwish module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - TensorBase *table = fbs_model->get_operation_lut(node_name); - - // Create module - if (table != NULL) { - op = new LUT(node_name.c_str(), table, MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } else { - op = new HardSwish(node_name.c_str(), MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } - op->print(); - - return op; - } - - void print() { ESP_LOGI("HardSwish", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_leakyrelu.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_leakyrelu.hpp deleted file mode 100644 index 3fc0e44b..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_leakyrelu.hpp +++ /dev/null @@ -1,106 +0,0 @@ -#pragma once - -#include "dl_module_base.hpp" -#include "dl_module_lut.hpp" - -namespace dl { -namespace module { -/** - * NOTE: - * - * @tparam feature_t supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t, implemented by LUT - * - int8_t: stands for operation in int16_t, implemented by LUT - * refer to https://onnx.ai/onnx/operators/onnx__LeakyRelu.html - */ -class LeakyRelu : public Module { -private: - float alpha; - -public: - /** - * @brief Construct a new LeakyRelu object. - * - * @param name name of module - * @param inplace inplace type. - */ - LeakyRelu(const char *name = NULL, - float alpha = 0.01, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type) - { - this->alpha = alpha; - } - - /** - * @brief Destroy the LeakyRelu object. - */ - ~LeakyRelu() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - std::vector> output_shapes(1, input_shapes[0]); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode = RUNTIME_MODE_AUTO) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } - DL_LOG_LAYER_LATENCY_END(this->name, "LeakyRelu"); - } - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - T *input_ptr = (T *)input->get_element_ptr(); - T *output_ptr = (T *)output->get_element_ptr(); - - float input_scale = DL_SCALE(input->exponent); - float output_scale = DL_RESCALE(output->exponent); - for (size_t i = 0; i < input->size; i++) { - float temp = input_ptr[i] * input_scale; - if (temp >= 0) { - tool::truncate(output_ptr[i], tool::round(temp * output_scale)); - } else { - tool::truncate(output_ptr[i], tool::round(temp * output_scale * this->alpha)); - } - } - } - - void forward_args(void *args) {} - - /** - * @brief deserialize LeakyRelu module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - float alpha = 0.01; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - fbs_model->get_operation_attribute(node_name, "alpha", alpha); - TensorBase *table = fbs_model->get_operation_lut(node_name); - - // Create module - if (table != NULL) { - op = new LUT(node_name.c_str(), table, MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } else { - op = new LeakyRelu(node_name.c_str(), alpha, MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } - - return op; - } - - void print() { ESP_LOGI("LeakyRelu", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_log.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_log.hpp deleted file mode 100644 index 2abb0625..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_log.hpp +++ /dev/null @@ -1,103 +0,0 @@ -#pragma once - -#include "dl_math.hpp" -#include "dl_module_base.hpp" -#include "dl_module_lut.hpp" - -namespace dl { -namespace module { -/** - * @brief: Calculates the natural log of the given input tensor, element-wise. - * Supports float, int16_t and int8_t. - */ -class Log : public Module { -public: - /** - * @brief Construct a new Log object. - * - * @param name name of module - * @param inplace inplace type. - */ - Log(const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type) - { - } - - /** - * @brief Destroy the Log object. - */ - ~Log() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - std::vector> output_shapes(1, input_shapes[0]); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode = RUNTIME_MODE_AUTO) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_FLOAT32) { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - float *input_ptr = (float *)input->get_element_ptr(); - float *output_ptr = (float *)output->get_element_ptr(); - - for (size_t i = 0; i < input->size; i++) { - output_ptr[i] = logf(input_ptr[i]); - } - } - DL_LOG_LAYER_LATENCY_END(this->name, "Log"); - } - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - T *input_ptr = (T *)input->get_element_ptr(); - T *output_ptr = (T *)output->get_element_ptr(); - - float input_scale = DL_SCALE(input->exponent); - float output_scale = DL_RESCALE(output->exponent); - for (size_t i = 0; i < input->size; i++) { - float temp = input_ptr[i] * input_scale; - temp = logf(temp); - tool::truncate(output_ptr[i], tool::round(temp * output_scale)); - } - } - - void forward_args(void *args) {} - - /** - * @brief deserialize Log module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - TensorBase *table = fbs_model->get_operation_lut(node_name); - - // Create module - if (table != NULL) { - op = new LUT(node_name.c_str(), table, MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } else { - op = new Log(node_name.c_str(), MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } - op->print(); - - return op; - } - - void print() { ESP_LOGI("Log", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_lut.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_lut.hpp deleted file mode 100644 index 96e9916b..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_lut.hpp +++ /dev/null @@ -1,122 +0,0 @@ -#pragma once - -#include "dl_module_base.hpp" - -namespace dl { -namespace module { -/** - * NOTE:int16 using linear interpolation + lookup table. - * - * @tparam feature_t supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t quantize - * - int8_t: stands for operation in int8_t quantize - */ -class LUT : public Module { -private: - TensorBase *table; /*LUT loop up table*/ - int step; /*LUT loop up table step: only available for int16.*/ -public: - /** - * @brief Construct a new LUT object. - * - * @param name name of module - * @param inplace inplace type. - */ - LUT(const char *name = NULL, - TensorBase *table = NULL, - module_inplace_t inplace = MODULE_INPLACE_CHANGED_BUFFER, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type) - { - this->table = table; - this->step = 1; - if (quant_type == QUANT_TYPE_SYMM_16BIT) { - this->step = 65536 / (this->table->get_size() - 1); - } - } - - /** - * @brief Destroy the LUT object. - */ - ~LUT() - { - if (this->table) { - delete this->table; - } - } - - std::vector> get_output_shape(std::vector> &input_shapes) - { - std::vector> output_shapes(1, input_shapes[0]); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - assert(output->exponent == this->table->exponent); - - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - int8_t *input_ptr = (int8_t *)input->get_element_ptr(); - int8_t *output_ptr = (int8_t *)output->get_element_ptr(); - int8_t *table_ptr = (int8_t *)(this->table->get_element_ptr()); - for (size_t i = 0; i < input->size; i++) { - output_ptr[i] = table_ptr[input_ptr[i] + 128]; - } - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - int16_t *input_ptr = (int16_t *)input->get_element_ptr(); - int16_t *output_ptr = (int16_t *)output->get_element_ptr(); - int16_t *table_ptr = (int16_t *)(this->table->get_element_ptr()); - - if (this->step == 1) { - for (size_t i = 0; i < input->size; i++) { - output_ptr[i] = table_ptr[input_ptr[i] + 32768]; - } - } else { - for (size_t i = 0; i < input->size; i++) { - int idx = input_ptr[i] + 32768; - int len = idx % this->step; - idx = idx / this->step; - - // linear interpolation - int x = table_ptr[idx]; - int y = table_ptr[idx + 1]; - output_ptr[i] = x + len * (y - x) / this->step; - } - } - } - DL_LOG_LAYER_LATENCY_END(this->name, "LUT"); - } - - void forward_args(void *args) {} - - /** - * @brief deserialize LUT module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - TensorBase *table = fbs_model->get_operation_lut(node_name); - - if (table == NULL) { - ESP_LOGE("LUT", "Table is null!"); - } - - // Create module - if (quant_type == QUANT_TYPE_SYMM_8BIT || quant_type == QUANT_TYPE_SYMM_16BIT) { - op = new LUT(node_name.c_str(), table, MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } else { - ESP_LOGE("LUT", "Only support QUANT_TYPE_SYMM_8BIT or QUANT_TYPE_SYMM_16BIT!"); - } - return op; - } - - void print() { ESP_LOGI("LUT", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_mul.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_mul.hpp deleted file mode 100644 index 95403312..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_mul.hpp +++ /dev/null @@ -1,105 +0,0 @@ -#pragma once - -#include "dl_base_mul2d.hpp" -#include "dl_base_shape.hpp" -#include "dl_module_base.hpp" - -namespace dl { -namespace module { -/** - * NOTE: multiplication is element-wise, i.e., output[i,j,k] = input0[i,j,k] * input1[i,j,k] - * - * @tparam feature_t supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t quantize - * - int8_t: stands for operation in int8_t quantize - */ -class Mul2D : public Module { -public: - /** - * @brief Construct a new Mul2D object. - * - * @param name name of module - * @param inplace inplace type. - */ - Mul2D(const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type) - { - } - - /** - * @brief Destroy the Mul2D object. - */ - ~Mul2D() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - assert(input_shapes.size() == 2); - - // support multidirectional broadcasting - std::vector output_shape = base::get_multidirectional_broadcasting_shape(input_shapes[0], input_shapes[1]); - - return std::vector>(1, output_shape); - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } - DL_LOG_LAYER_LATENCY_END(this->name, "Mul2D"); - } - - void forward_args(void *args) - { - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - base::mul2d(args); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - base::mul2d(args); - } - } - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - TensorBase *input0 = tensors[m_inputs_index[0]]; - TensorBase *input1 = tensors[m_inputs_index[1]]; - TensorBase *output = tensors[m_outputs_index[0]]; - - std::vector> m_args = - base::get_arith_operation_args(output, input0, input1, Linear, nullptr, mode); - int task_size = m_args.size(); - if (task_size == 1) { // single task - forward_args((void *)&m_args[0]); - } else if (task_size == 2) { // multi task, use semaphore to maintain synchronization. - module_forward_dual_core(this, (void *)&m_args[0], (void *)&m_args[1]); - } else { - ESP_LOGE("Mul2D", "Only support task size is 1 or 2, currently task size is %d", task_size); - } - } - - /** - * @brief deserialize Mul2D module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - - // Create module - if (quant_type == QUANT_TYPE_SYMM_8BIT || quant_type == QUANT_TYPE_SYMM_16BIT) { - op = new Mul2D(node_name.c_str(), MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } - return op; - } - - void print() { ESP_LOGI("Mul2D", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_prelu.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_prelu.hpp deleted file mode 100644 index 4e9bdd3b..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_prelu.hpp +++ /dev/null @@ -1,112 +0,0 @@ -#pragma once - -#include "dl_base_prelu.hpp" -#include "dl_module_base.hpp" -#include "dl_module_lut.hpp" - -namespace dl { -namespace module { -/** - * NOTE: - * - * @tparam feature_t supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t quantize - * - int8_t: stands for operation in int8_t quantize - */ -class PRelu : public Module { -private: - TensorBase *alpha; - -public: - /** - * @brief Construct a new PRelu object. - * - * @param name name of module - * @param alpha learnable param alpha of prelu, slope for neg part. - * @param inplace inplace type. - */ - PRelu(const char *name = NULL, - TensorBase *alpha = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type), alpha(alpha) - { - } - - /** - * @brief Destroy the PRelu object. - */ - ~PRelu() { delete this->alpha; } - - std::vector> get_output_shape(std::vector> &input_shapes) - { - assert(input_shapes.size() == 1); - assert(input_shapes[0][3] == this->alpha->shape[0]); - std::vector> output_shapes(1, input_shapes[0]); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } - DL_LOG_LAYER_LATENCY_END(this->name, "PRelu"); - } - - void forward_args(void *args) - { - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - base::prelu(args); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - base::prelu(args); - } - } - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - - std::vector> m_args = base::get_activation_args(output, input, PReLU, alpha, mode); - int task_size = m_args.size(); - if (task_size == 1) { // single task - forward_args((void *)&m_args[0]); - } else if (task_size == 2) { // multi task, use semaphore to maintain synchronization. - module_forward_dual_core(this, (void *)&m_args[0], (void *)&m_args[1]); - } else { - ESP_LOGE("PRelu", "Only support task size is 1 or 2, currently task size is %d", task_size); - } - } - - /** - * @brief deserialize PRelu module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - TensorBase *alpha = fbs_model->get_operation_parameter(node_name, 1); - TensorBase *table = fbs_model->get_operation_lut(node_name); - // [c, 1, 1] - assert(alpha->shape.size() == 3); - - // Create module - if (table != NULL) { - op = new LUT(node_name.c_str(), table, MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } else if (quant_type == QUANT_TYPE_SYMM_8BIT || quant_type == QUANT_TYPE_SYMM_16BIT) { - op = new PRelu(node_name.c_str(), alpha, MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } - return op; - } - - void print() { ESP_LOGI("PRelU", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_relu.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_relu.hpp deleted file mode 100644 index 4e4bc400..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_relu.hpp +++ /dev/null @@ -1,121 +0,0 @@ -#pragma once - -#include "dl_module_base.hpp" - -namespace dl { -namespace module { -/** - * NOTE: - * - * @tparam feature_t supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t, implemented by LUT - * - int8_t: stands for operation in int16_t, implemented by LUT - * refer to https://onnx.ai/onnx/operators/onnx__Relu.html - */ -class Relu : public Module { -public: - /** - * @brief Construct a new Relu object. - * - * @param name name of module - * @param inplace inplace type. - */ - Relu(const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type) - { - } - - /** - * @brief Destroy the Relu object. - */ - ~Relu() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - std::vector> output_shapes(1, input_shapes[0]); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode = RUNTIME_MODE_AUTO) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_FLOAT32) { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - float *input_ptr = (float *)input->get_element_ptr(); - float *output_ptr = (float *)output->get_element_ptr(); - - for (size_t i = 0; i < input->size; i++) { - if (input_ptr[i] >= 0) { - output_ptr[i] = input_ptr[i]; - } else { - output_ptr[i] = 0; - } - } - } - DL_LOG_LAYER_LATENCY_END(this->name, "Relu"); - } - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - T *input_ptr = (T *)input->get_element_ptr(); - T *output_ptr = (T *)output->get_element_ptr(); - - if (input->exponent == output->exponent) { - for (size_t i = 0; i < input->size; i++) { - if (input_ptr[i] >= 0) { - output_ptr[i] = input_ptr[i]; - } else { - output_ptr[i] = 0; - } - } - } else { - float input_scale = DL_SCALE(input->exponent); - float output_scale = DL_RESCALE(output->exponent); - for (size_t i = 0; i < input->size; i++) { - float temp = input_ptr[i] * input_scale; - if (temp >= 0) { - tool::truncate(output_ptr[i], tool::round(temp * output_scale)); - } else { - output_ptr[i] = 0; - } - } - } - } - - void forward_args(void *args) {} - - /** - * @brief deserialize Relu module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - TensorBase *table = fbs_model->get_operation_lut(node_name); - - // Create module - if (table != NULL) { - op = new LUT(node_name.c_str(), table, MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } else { - op = new Relu(node_name.c_str(), MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } - - return op; - } - - void print() { ESP_LOGI("Relu", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_requantize_linear.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_requantize_linear.hpp deleted file mode 100644 index 734c4f9f..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_requantize_linear.hpp +++ /dev/null @@ -1,64 +0,0 @@ -#pragma once - -#include "dl_module_base.hpp" - -namespace dl { -namespace module { - -class RequantizeLinear : public Module { -public: - /** - * @brief Construct a new RequantizeLinear object. - * - * @param name name of module - * @param inplace inplace type. - */ - RequantizeLinear(const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type) - { - } - - /** - * @brief Destroy the RequantizeLinear object. - */ - ~RequantizeLinear() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - std::vector> output_shapes(1, input_shapes[0]); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - assert(input->get_size() == output->get_size()); - output->assign(input); - DL_LOG_LAYER_LATENCY_END(this->name, "RequantizeLinear"); - } - - void forward_args(void *args) {} - - /** - * @brief deserialize RequantizeLinear module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - - // Create module - op = new RequantizeLinear(node_name.c_str(), MODULE_INPLACE_CHANGED_BUFFER, quant_type); - return op; - } - - void print() { ESP_LOGI("RequantizeLinear", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_reshape.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_reshape.hpp deleted file mode 100644 index 56f22e79..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_reshape.hpp +++ /dev/null @@ -1,120 +0,0 @@ -#pragma once - -#include "dl_module_base.hpp" - -namespace dl { -namespace module { - -// https://onnx.ai/onnx/operators/onnx__Reshape.html -class Reshape : public Module { -private: - TensorBase *m_shape; /**/ - -public: - /** - * @brief Construct a new Reshape object. - * - * @param name name of module - * @param inplace inplace type. - */ - Reshape(TensorBase *shape, - const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type), m_shape(shape) - { - } - - /** - * @brief Destroy the Reshape object. - */ - ~Reshape() - { - if (m_shape) { - delete m_shape; - } - } - - std::vector> get_output_shape(std::vector> &input_shapes) - { - assert(input_shapes.size() == 1); - - int input_size = 1; - for (int i = 0; i < input_shapes[0].size(); i++) { - assert(input_shapes[0][i] > 0); - input_size *= input_shapes[0][i]; - } - - int64_t *shape_param = static_cast(m_shape->get_element_ptr()); - int negative_index = -1; - int shape_param_size = 1; - for (int i = 0; i < m_shape->get_size(); i++) { - if (negative_index == -1 && shape_param[i] == -1) { - negative_index = i; - } else if (shape_param[i] > 0) { - shape_param_size *= shape_param[i]; - } else { - assert(false); - } - } - - std::vector output(m_shape->get_size()); - if (negative_index == -1) { - assert(shape_param_size == input_size); - for (int i = 0; i < m_shape->get_size(); i++) { - output[i] = static_cast(shape_param[i]); - } - } else { - assert(input_size % shape_param_size == 0); - for (int i = 0; i < m_shape->get_size(); i++) { - if (i == negative_index) { - output[i] = input_size / shape_param_size; - } else { - output[i] = static_cast(shape_param[i]); - } - } - } - std::vector> output_shapes(1, output); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - assert(input->get_size() == output->get_size()); - if (output->get_element_ptr() != input->get_element_ptr()) { - output->assign(input); - } - DL_LOG_LAYER_LATENCY_END(this->name, "Reshape"); - } - - void forward_args(void *args) {} - - /** - * @brief deserialize Reshape module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - TensorBase *shape = fbs_model->get_operation_parameter(node_name, 1); - - // Create module - op = new Reshape(shape, node_name.c_str(), MODULE_INPLACE_UNCHANGED_BUFFER, quant_type); - return op; - } - - void print() - { - ESP_LOGI("Reshape", - "quant_type: %s, shape: %s.", - quant_type_to_string(quant_type), - shape_to_string(m_shape->get_shape()).c_str()); - } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_resize2d.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_resize2d.hpp deleted file mode 100644 index 03d1d8be..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_resize2d.hpp +++ /dev/null @@ -1,123 +0,0 @@ -#pragma once - -#include "dl_base_resize2d.hpp" -#include "dl_module_base.hpp" - -namespace dl { -namespace module { -/** - * NOTE: - * - * @tparam feature_t supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t quantize - * - int8_t: stands for operation in int8_t quantize - */ -class Resize2D : public Module { -private: - const resize_mode_t resize_type; /**/ - const float scale_y; /**/ - const float scale_x; /**/ -public: - /** - * @brief Construct a new Resize2D object. - * - * @param name name of module - * @param resize_type one of RESIZE_NEAREST or RESIZE_LINEAR or RESIZE_CUBIC - * @param scale_y scale in height - * @param scale_x scale in width - */ - Resize2D(const char *name = NULL, - const resize_mode_t resize_type = RESIZE_NEAREST, - const float scale_y = 2.f, - const float scale_x = 2.f, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, MODULE_NON_INPLACE, quant_type), resize_type(resize_type), scale_y(scale_y), scale_x(scale_x) - { - } - - /** - * @brief Destroy the Resize2D object. - */ - ~Resize2D() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - assert(input_shapes.size() == 1); - assert(input_shapes[0].size() == 4); - int *input_shape = input_shapes[0].data(); - - std::vector output_shape(4); - output_shape[0] = input_shape[0]; - output_shape[1] = (int)(input_shape[1] * this->scale_y); - output_shape[2] = (int)(input_shape[2] * this->scale_x); - output_shape[3] = input_shape[3]; - - std::vector> output_shapes(1, output_shape); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } - DL_LOG_LAYER_LATENCY_END(this->name, "Resize2D"); - } - - void forward_args(void *args) - { - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - base::resize2d(args); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - base::resize2d(args); - } - } - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - - std::vector> m_args = - base::get_resize_operation_args(output, input, RESIZE_NEAREST, scale_y, scale_x); - int task_size = m_args.size(); - if (task_size == 1) { // single task - forward_args((void *)&m_args[0]); - } else if (task_size == 2) { // multi task, use semaphore to maintain synchronization. - module_forward_dual_core(this, (void *)&m_args[0], (void *)&m_args[1]); - } else { - ESP_LOGE("Resize2D", "Only support task size is 1 or 2, currently task size is %d", task_size); - } - } - - /** - * @brief deserialize Resize2D module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - resize_mode_t resize_mode; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - fbs_model->get_operation_attribute(node_name, "mode", resize_mode); - dl::TensorBase *resize_scales_tensor = fbs_model->get_operation_parameter(node_name, 2); - assert(resize_scales_tensor->shape.size() == 1 && resize_scales_tensor->shape[0] == 4); - float *resize_scales = (float *)resize_scales_tensor->get_element_ptr(); - - // Create module - if (quant_type == QUANT_TYPE_SYMM_8BIT || quant_type == QUANT_TYPE_SYMM_16BIT) { - op = new Resize2D(node_name.c_str(), resize_mode, resize_scales[2], resize_scales[3], quant_type); - } - delete resize_scales_tensor; - return op; - } - - void print() { ESP_LOGI("Resize2D", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_sigmoid.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_sigmoid.hpp deleted file mode 100644 index 50ec2c8f..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_sigmoid.hpp +++ /dev/null @@ -1,106 +0,0 @@ -#pragma once - -#include "dl_math.hpp" -#include "dl_module_base.hpp" -#include "dl_module_lut.hpp" - -namespace dl { -namespace module { -/** - * NOTE: - * - * @tparam feature_t supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t quantize - * - int8_t: stands for operation in int8_t quantize - */ -class Sigmoid : public Module { -public: - /** - * @brief Construct a new Sigmoid object. - * - * @param name name of module - * @param inplace inplace type. - */ - Sigmoid(const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type) - { - } - - /** - * @brief Destroy the Sigmoid object. - */ - ~Sigmoid() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - assert(input_shapes.size() == 1); - std::vector> output_shapes(1, input_shapes[0]); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - int8_t *input_ptr = (int8_t *)input->get_element_ptr(); - int8_t *output_ptr = (int8_t *)output->get_element_ptr(); - - float input_scale = DL_SCALE(input->exponent); - float output_scale = DL_RESCALE(output->exponent); - for (size_t i = 0; i < input->size; i++) { - float temp = math::sigmoid((float)input_ptr[i] * input_scale); - tool::truncate(output_ptr[i], tool::round(temp * output_scale)); - } - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - int16_t *input_ptr = (int16_t *)input->get_element_ptr(); - int16_t *output_ptr = (int16_t *)output->get_element_ptr(); - - float input_scale = DL_SCALE(input->exponent); - float output_scale = DL_RESCALE(output->exponent); - for (size_t i = 0; i < input->size; i++) { - float temp = math::sigmoid((float)input_ptr[i] * input_scale); - tool::truncate(output_ptr[i], tool::round(temp * output_scale)); - } - } else if (quant_type == QUANT_TYPE_FLOAT32) { - float *input_ptr = (float *)input->get_element_ptr(); - float *output_ptr = (float *)output->get_element_ptr(); - - for (size_t i = 0; i < input->size; i++) { - output_ptr[i] = math::sigmoid(input_ptr[i]); - } - } - DL_LOG_LAYER_LATENCY_END(this->name, "Sigmoid"); - } - - void forward_args(void *args) {} - - /** - * @brief deserialize Sigmoid module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - TensorBase *table = fbs_model->get_operation_lut(node_name); - - // Create module - if (table != NULL) { - op = new LUT(node_name.c_str(), table, MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } else { - op = new Sigmoid(node_name.c_str(), MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } - - return op; - } - - void print() { ESP_LOGI("Sigmoid", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_sqrt.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_sqrt.hpp deleted file mode 100644 index 550ebcdf..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_sqrt.hpp +++ /dev/null @@ -1,103 +0,0 @@ -#pragma once - -#include "dl_math.hpp" -#include "dl_module_base.hpp" -#include "dl_module_lut.hpp" - -namespace dl { -namespace module { -/** - * @brief: Square root takes one input data (Tensor) and produces one output data (Tensor) where the square root is. - * Supports float, int16_t and int8_t. - */ -class Sqrt : public Module { -public: - /** - * @brief Construct a new Sqrt object. - * - * @param name name of module - * @param inplace inplace type. - */ - Sqrt(const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type) - { - } - - /** - * @brief Destroy the Sqrt object. - */ - ~Sqrt() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - std::vector> output_shapes(1, input_shapes[0]); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode = RUNTIME_MODE_AUTO) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - forward_template(tensors, mode); - } else if (quant_type == QUANT_TYPE_FLOAT32) { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - float *input_ptr = (float *)input->get_element_ptr(); - float *output_ptr = (float *)output->get_element_ptr(); - - for (size_t i = 0; i < input->size; i++) { - output_ptr[i] = sqrtf(input_ptr[i]); - } - } - DL_LOG_LAYER_LATENCY_END(this->name, "Sqrt"); - } - - template - void forward_template(std::vector &tensors, runtime_mode_t mode) - { - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - T *input_ptr = (T *)input->get_element_ptr(); - T *output_ptr = (T *)output->get_element_ptr(); - - float input_scale = DL_SCALE(input->exponent); - float output_scale = DL_RESCALE(output->exponent); - for (size_t i = 0; i < input->size; i++) { - float temp = input_ptr[i] * input_scale; - temp = sqrtf(temp); - tool::truncate(output_ptr[i], tool::round(temp * output_scale)); - } - } - - void forward_args(void *args) {} - - /** - * @brief deserialize Sqrt module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - TensorBase *table = fbs_model->get_operation_lut(node_name); - - // Create module - if (table != NULL) { - op = new LUT(node_name.c_str(), table, MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } else { - op = new Sqrt(node_name.c_str(), MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } - op->print(); - - return op; - } - - void print() { ESP_LOGI("Sqrt", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_squeeze.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_squeeze.hpp deleted file mode 100644 index e9b74ee9..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_squeeze.hpp +++ /dev/null @@ -1,118 +0,0 @@ -#pragma once - -#include "dl_module_base.hpp" - -namespace dl { -namespace module { - -// https://onnx.ai/onnx/operators/onnx__Squeeze.html -class Squeeze : public Module { -private: - TensorBase *m_axes; /**/ - -public: - /** - * @brief Construct a new Squeeze object. - * - * @param name name of module - * @param inplace inplace type. - */ - Squeeze(TensorBase *axes, - const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type), m_axes(axes) - { - } - - /** - * @brief Destroy the Squeeze object. - */ - ~Squeeze() - { - if (m_axes) { - delete m_axes; - } - } - - std::vector> get_output_shape(std::vector> &input_shapes) - { - assert(input_shapes.size() == 1); - std::vector input_shape = input_shapes[0]; - std::vector output_shape; - if (m_axes == nullptr) { - for (int i = 0; i < input_shape.size(); i++) { - if (input_shape[i] != 1) - output_shape.push_back(input_shape[i]); - } - } else { - int64_t *axes_param = static_cast(m_axes->get_element_ptr()); - - for (int i = 0; i < m_axes->get_size(); i++) { - if (axes_param[i] < 0) { - axes_param[i] += input_shape.size(); - } - } - - for (int i = 0; i < input_shape.size(); i++) { - if (input_shape[i] != 1) - output_shape.push_back(input_shape[i]); - else { - bool in = false; - for (int j = 0; j < m_axes->get_size(); j++) { - if (axes_param[j] == i) { - in = true; - break; - } - } - if (!in) { - output_shape.push_back(input_shape[i]); - } - } - } - } - - std::vector> output_shapes(1, output_shape); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - assert(input->get_size() == output->get_size()); - if (output->get_element_ptr() != input->get_element_ptr()) { - output->assign(input); - } - DL_LOG_LAYER_LATENCY_END(this->name, "Squeeze"); - } - - void forward_args(void *args) {} - - /** - * @brief deserialize Squeeze module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - TensorBase *axes = fbs_model->get_operation_parameter(node_name, 1); - - // Create module - op = new Squeeze(axes, node_name.c_str(), MODULE_INPLACE_UNCHANGED_BUFFER, quant_type); - return op; - } - - void print() - { - ESP_LOGI("Squeeze", - "quant_type: %s, axes: %s.", - quant_type_to_string(quant_type), - shape_to_string(m_axes->get_shape()).c_str()); - } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_tanh.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_tanh.hpp deleted file mode 100644 index 6349fb83..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_tanh.hpp +++ /dev/null @@ -1,105 +0,0 @@ -#pragma once - -#include "dl_module_base.hpp" -#include "dl_module_lut.hpp" - -namespace dl { -namespace module { -/** - * NOTE: - * - * @tparam feature_t supports float32, int16_t and int8_t, - * - float32 : stands for operation in float32 - * - int16_t: stands for operation in int16_t, implemented by LUT - * - int8_t: stands for operation in int16_t, implemented by LUT - */ -class Tanh : public Module { -public: - /** - * @brief Construct a new Tanh object. - * - * @param name name of module - * @param inplace inplace type. - */ - Tanh(const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type) - { - } - - /** - * @brief Destroy the Tanh object. - */ - ~Tanh() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - std::vector> output_shapes(1, input_shapes[0]); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - - if (quant_type == QUANT_TYPE_SYMM_8BIT) { - int8_t *input_ptr = (int8_t *)input->get_element_ptr(); - int8_t *output_ptr = (int8_t *)output->get_element_ptr(); - - float input_scale = DL_SCALE(input->exponent); - float output_scale = DL_RESCALE(output->exponent); - for (size_t i = 0; i < input->size; i++) { - float temp = math::tanh((float)input_ptr[i] * input_scale); - tool::truncate(output_ptr[i], tool::round(temp * output_scale)); - } - } else if (quant_type == QUANT_TYPE_SYMM_16BIT) { - int16_t *input_ptr = (int16_t *)input->get_element_ptr(); - int16_t *output_ptr = (int16_t *)output->get_element_ptr(); - - float input_scale = DL_SCALE(input->exponent); - float output_scale = DL_RESCALE(output->exponent); - for (size_t i = 0; i < input->size; i++) { - float temp = math::tanh((float)input_ptr[i] * input_scale); - tool::truncate(output_ptr[i], tool::round(temp * output_scale)); - } - } else if (quant_type == QUANT_TYPE_FLOAT32) { - float *input_ptr = (float *)input->get_element_ptr(); - float *output_ptr = (float *)output->get_element_ptr(); - - for (size_t i = 0; i < input->size; i++) { - output_ptr[i] = math::tanh(input_ptr[i]); - } - } - DL_LOG_LAYER_LATENCY_END(this->name, "Tanh"); - } - - void forward_args(void *args) {} - - /** - * @brief deserialize Tanh module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - TensorBase *table = fbs_model->get_operation_lut(node_name); - - // Create module - if (table != NULL) { - op = new LUT(node_name.c_str(), table, MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } else { - op = new Tanh(node_name.c_str(), MODULE_INPLACE_CHANGED_BUFFER, quant_type); - } - - return op; - } - - void print() { ESP_LOGI("Tanh", "quant_type: %s.", quant_type_to_string(quant_type)); } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_transpose.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_transpose.hpp deleted file mode 100644 index 4c0e5dcd..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_transpose.hpp +++ /dev/null @@ -1,87 +0,0 @@ -#pragma once - -#include "dl_base_mul2d.hpp" -#include "dl_module_base.hpp" - -namespace dl { -namespace module { -/** - * @tparam feature_t supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t quantize - * - int8_t: stands for operation in int8_t quantize - */ -class Transpose : public Module { -private: - std::vector m_perm; /**/ - -public: - /** - * @brief Construct a new Transpose object. - * - * @param name name of module - * @param inplace inplace type. - */ - Transpose(const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE, - std::vector perm = {}) : - Module(name, inplace, quant_type), m_perm(perm) - { - } - - /** - * @brief Destroy the Transpose object. - */ - ~Transpose() {} - - std::vector> get_output_shape(std::vector> &input_shapes) - { - assert(input_shapes.size() == 1); - assert(input_shapes[0].size() == m_perm.size() || m_perm.size() == 0); - - std::vector output_shape; - - for (int i = 0; i < input_shapes[0].size(); i++) { - output_shape.push_back(input_shapes[0][m_perm[i]]); - } - - std::vector> output_shapes(1, output_shape); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - output->transpose(input, m_perm); - DL_LOG_LAYER_LATENCY_END(this->name, "Transpose"); - } - - void forward_args(void *args) {} - - /** - * @brief deserialize Transpose module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - std::vector perm; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - fbs_model->get_operation_attribute(node_name, "perm", perm); - - // Create module - op = new Transpose(node_name.c_str(), MODULE_NON_INPLACE, quant_type, perm); - return op; - } - - void print() - { - ESP_LOGI( - "Transpose", "quant_type: %s. perm: %s", quant_type_to_string(quant_type), shape_to_string(m_perm).c_str()); - } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_unsqueeze.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_unsqueeze.hpp deleted file mode 100644 index ec212f66..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/include/dl_module_unsqueeze.hpp +++ /dev/null @@ -1,111 +0,0 @@ -#pragma once - -#include "dl_module_base.hpp" - -namespace dl { -namespace module { - -// https://onnx.ai/onnx/operators/onnx__Unsqueeze.html -class Unsqueeze : public Module { -private: - TensorBase *m_axes; /**/ - -public: - /** - * @brief Construct a new Unsqueeze object. - * - * @param name name of module - * @param inplace inplace type. - */ - Unsqueeze(TensorBase *axes, - const char *name = NULL, - module_inplace_t inplace = MODULE_NON_INPLACE, - quant_type_t quant_type = QUANT_TYPE_NONE) : - Module(name, inplace, quant_type), m_axes(axes) - { - } - - /** - * @brief Destroy the Unsqueeze object. - */ - ~Unsqueeze() - { - if (m_axes) { - delete m_axes; - } - } - - std::vector> get_output_shape(std::vector> &input_shapes) - { - assert(input_shapes.size() == 1); - std::vector input_shape = input_shapes[0]; - std::vector output_shape; - - int64_t *axes_param = static_cast(m_axes->get_element_ptr()); - for (int i = 0; i < m_axes->get_size(); i++) { - if (axes_param[i] < 0) { - axes_param[i] += input_shape.size() + 1; - } - } - - int count = 0; - for (int i = 0; i < input_shape.size() + m_axes->get_size(); i++) { - bool in = false; - for (int j = 0; j < m_axes->get_size(); j++) { - if (axes_param[j] == i) { - in = true; - break; - } - } - if (in) { - output_shape.push_back(1); - count++; - } else { - output_shape.push_back(input_shape[i - count]); - } - } - - std::vector> output_shapes(1, output_shape); - return output_shapes; - } - - void forward(std::vector &tensors, runtime_mode_t mode) - { - DL_LOG_LAYER_LATENCY_INIT(); - DL_LOG_LAYER_LATENCY_START(); - TensorBase *input = tensors[m_inputs_index[0]]; - TensorBase *output = tensors[m_outputs_index[0]]; - assert(input->get_size() == output->get_size()); - if (output->get_element_ptr() != input->get_element_ptr()) { - output->assign(input); - } - DL_LOG_LAYER_LATENCY_END(this->name, "Unsqueeze"); - } - - void forward_args(void *args) {} - - /** - * @brief deserialize Unsqueeze module instance by node serialization information - */ - static Module *deserialize(fbs::FbsModel *fbs_model, std::string node_name) - { - Module *op = nullptr; - quant_type_t quant_type; - fbs_model->get_operation_attribute(node_name, "quant_type", quant_type); - TensorBase *axes = fbs_model->get_operation_parameter(node_name, 1); - - // Create module - op = new Unsqueeze(axes, node_name.c_str(), MODULE_INPLACE_UNCHANGED_BUFFER, quant_type); - return op; - } - - void print() - { - ESP_LOGI("Unsqueeze", - "quant_type: %s, axes: %s.", - quant_type_to_string(quant_type), - shape_to_string(m_axes->get_shape()).c_str()); - } -}; -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/src/dl_module_base.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/src/dl_module_base.cpp deleted file mode 100644 index 2821f7bc..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/module/src/dl_module_base.cpp +++ /dev/null @@ -1,27 +0,0 @@ -#include "dl_module_base.hpp" -#include - -using namespace dl; - -namespace dl { -namespace module { -Module::Module(const char *name, module_inplace_t inplace, quant_type_t quant_type) : - inplace(inplace), quant_type(quant_type) -{ - if (name) { - int length = strlen(name) + 1; - this->name = (char *)malloc(sizeof(char) * length); - memcpy(this->name, name, length); - } else { - this->name = NULL; - } -} - -Module::~Module() -{ - if (this->name) { - free((void *)this->name); - } -} -} // namespace module -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/CMakeLists.txt b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/CMakeLists.txt deleted file mode 100644 index 05c77e67..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/CMakeLists.txt +++ /dev/null @@ -1,28 +0,0 @@ -# set(srcs - -# test_xxx.cpp -# ) # for debug - -set(src_dirs - -. -) - -set(include_dirs - -. -) - -set(requires - -unity -dl -) - -idf_component_register( - SRCS ${srcs} - SRC_DIRS ${src_dirs} - INCLUDE_DIRS ${include_dirs} - REQUIRES ${requires} -) - diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/README.md b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/README.md deleted file mode 100644 index f56b35da..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/README.md +++ /dev/null @@ -1,90 +0,0 @@ -# TestCase.py - -> This python code can help to generate some TestCases. - - - -## Arguments Description - -testcase.py has several optional arguments: - -| arguments | candidate value | Must | Note | -| -------------- | ------------------------------------------------ | :--: | ------------------------------------------------------------ | -| --target_chip | esp32, esp32s2, esp32c3, esp32s3 | Yes | accept one value only. | -| --operation | conv2d, depthwise_conv2d, max_pool2d, avg_pool2d, global_max_pool2d, global_avg_pool2d | Yes | accept one value only. | -| --feature_type | s16, s8 | Yes | accept one value only. | -| --filter_shape | "(H, W)", "(H, W, C, N)" | No | empty: all dimensions are random; "(H, W)": C and N are random; "(H, W, C, N)": as input | -| --input_shape | "(H, W)" | No | empty: H and W are random; "(H, W)": C is according to filter_shape | -| --stride | "(1, Y, X, 1)" | No | generated randomly if not setted. | -| --dilation | "(Y, X)" | No | generated randomly if not setted. | -| --activation | None, "ReLU", "LeakyReLU", "PReLU" | No | accept tuple only, e.g., "(None, \"ReLU\")". Test on all supported activation if not setted. | -| --step | integer > 0 | No | As the size of Flash and PSRAM are limited, only 'step' number of TestCases are generated each time. Default = 20. | -| --total | integer > 0 | No | The 'total' number of TestCases to be generated. Default = 100. | -| --quant | 0 or 1 | No | 0: per-layer quantization for int16 and int8 operations, 1: per-channel quantization for int8 conv2d and depthwise_conv2d | - -Also, you can run `python testcase.py` or `python testcase.py -h` to get help menu. - -```bash -optional arguments: - -h, --help show this help message and exit - --target_chip TARGET_CHIP - ('esp32', 'esp32s2', 'esp32c3', 'esp32s3') - --operation OPERATION - conv2d, depthwise_conv2d - --feature_type FEATURE_TYPE - s16, s8 - --filter_shape FILTER_SHAPE - "(H, W, C, N)" - --stride STRIDE "(1, y, x, 1)" - --dilation DILATION "(y, x)" - --activation ACTIVATION - "(None, \"ReLU\", \"LeakyReLU\", \"PReLU\")" - --step STEP Wait for every this number of testcases - --total TOTAL The total of testcases - --quant QUANTIZATION 0 or 1 -``` - - - -## How to use - -1. Open a terminal. - -2. Go to the root of ./component/dl/test/testcase.py. - -3. Run generator code: - - ```bash - python testcase.py --target_chip esp32 --operation conv2d --feature_type s16 --filter_shape "(3,3,2,8)" --activation "(None,)" --step 10 --total 100 - ``` - - Get informant as below. Now, 10 TestCases were generated for testing. - - ```bash - >>> 10 TestCases were generated - Input a number to continue or "exit" to exit: - ``` - - > After this 10 TestCases were tested, you can input a number to generate next certain of TestCase. - > - > DO NOT terminate the program until the testing finished. Once the program is terminated, generated TestCase files will be removed. - > - > Input "exit" to exit generator code. DO NOT use "ctrl + c". - -4. Open a new terminal. - -5. Go the root of ./test - -6. Run test project code: - - ```bash - idf.py -p /dev/ttyUSB0 -b 921600 flash monitor - ``` - - Test results are printed on terminal like this: - - ![](./test_result.png) - - -## Reference -[Unit Test](https://github.com/espressif/esp-idf/tree/master/examples/system/unit_test) \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_add.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_add.cpp deleted file mode 100644 index 084db404..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_add.cpp +++ /dev/null @@ -1,755 +0,0 @@ -#include "test_data.hpp" - -#include "dl_base.hpp" -#include "dl_define.hpp" -#include "dl_nn_add2d.hpp" -#include "dl_tool.hpp" - -#include "unity.h" -#include - -using namespace dl; -using namespace nn; -using namespace tool; -using namespace base; -using namespace std; - -template -void add2d_c(Tensor &output, - Tensor &input0, - Tensor &input1, - const Activation *const activation) -{ - int height = input0.shape[0]; // inputs and output are the same shape - int width = input0.shape[1]; - int channel = input0.shape[2]; - - feature_t *input0_element = input0.get_element_ptr(); - int input0_y_offset = input0.shape[1] * input0.shape[2]; - int input0_x_offset = input0.shape[2]; - - feature_t *input1_element = input1.get_element_ptr(); - int input1_y_offset = input1.shape[1] * input1.shape[2]; - int input1_x_offset = input1.shape[2]; - - feature_t *output_element = output.get_element_ptr(); // output - int output_y_offset = output.shape[1] * output.shape[2]; - int output_x_offset = output.shape[2]; - - int buffer = 0; - int max_input_exponent = DL_MAX(input0.exponent, input1.exponent); - int input0_shift = max_input_exponent - input0.exponent; - int input1_shift = max_input_exponent - input1.exponent; - int output_scale = 1; - int output_shift = output.exponent - max_input_exponent; - if (output_shift < 0) { - output_scale = 1 << (-output_shift); - output_shift = 0; - } - - activation_type_t activation_type = activation ? activation->type : Linear; - feature_t activation_alpha; - int activation_shift; - const feature_t *activation_alpha_ptr; - - switch (activation_type) { - case ReLU: - activation_alpha = 0; - activation_shift = 0; - activation_alpha_ptr = NULL; - break; - case LeakyReLU: - activation_alpha = activation->element[0]; - activation_shift = -activation->exponent; - activation_alpha_ptr = NULL; - break; - case PReLU: - activation_alpha = 0; - activation_alpha_ptr = activation->element; - activation_shift = -activation->exponent; - break; - default: - activation_alpha = 0; - activation_alpha_ptr = NULL; - activation_shift = -1; - break; - } - - for (size_t output_y = 0; output_y < height; output_y++) { - feature_t *input0_11c = input0_element; - feature_t *input1_11c = input1_element; - feature_t *output_11c = output_element; - - for (size_t output_x = 0; output_x < width; output_x++) { - for (size_t output_c = 0; output_c < channel; output_c++) { - buffer = (int)(DL_RIGHT_SHIFT(input0_11c[output_c], input0_shift)) + - (int)(DL_RIGHT_SHIFT(input1_11c[output_c], input1_shift)); - buffer = DL_RIGHT_SHIFT(buffer * output_scale, output_shift); - tool::truncate(output_11c[output_c], buffer); - if (activation_type == ReLU) { - output_11c[output_c] = DL_MAX(0, output_11c[output_c]); - } else if (activation_type == LeakyReLU) { - if (output_11c[output_c] < 0) { - buffer = DL_RIGHT_SHIFT((output_11c[output_c] * activation_alpha), activation_shift); - tool::truncate(output_11c[output_c], buffer); - } - } else if (activation_type == PReLU) { - if (output_11c[output_c] < 0) { - buffer = - DL_RIGHT_SHIFT((output_11c[output_c] * activation_alpha_ptr[output_c]), activation_shift); - tool::truncate(output_11c[output_c], buffer); - } - } - } - input0_11c += input0_x_offset; - input1_11c += input1_x_offset; - output_11c += output_x_offset; - } - - input0_element += input0_y_offset; - input1_element += input1_y_offset; - output_element += output_y_offset; - } -} - -template -Tensor add2d_c(const int output_exponent, - Tensor &input0, - Tensor &input1, - const Activation *activation, - const std::vector &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -{ - assert(input0.is_same_shape(input1)); - - Tensor output; - output.set_exponent(output_exponent).set_shape(input0.shape).malloc_element(); - add2d_c(output, input0, input1, activation); - - return output; -} - -bool test_add_s8(int exponent0, - int exponent1, - int exponent_out, - int offset0, - int offset1, - int height, - int width, - int channel, - int activation_type, - bool inplace) -{ - if (!inplace) { - Tensor input0; - input0.set_element((int8_t *)&input0_element[offset0]) - .set_exponent(exponent0) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1; - input1.set_element((int8_t *)&input1_element[offset1]) - .set_exponent(exponent1) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - // output; - if (activation_type == 0) { - Tensor output_c = add2d_c(exponent_out, input0, input1, (Activation *)NULL); - latency.start(); - Tensor output = add2d( - exponent_out, input0, input1, (Activation *)NULL); //(Activation *)NULL - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 1) { // relu - Tensor output_c = add2d_c(exponent_out, input0, input1, &layer_activation_relu); - latency.start(); - Tensor output = add2d(exponent_out, input0, input1, &layer_activation_relu); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 2) { // leakyrelu - Tensor output_c = add2d_c(exponent_out, input0, input1, &layer_activation_lrelu); - latency.start(); - Tensor output = add2d(exponent_out, input0, input1, &layer_activation_lrelu); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 3) { // prelu - Tensor output_c = add2d_c(exponent_out, input0, input1, &layer_activation_prelu); - latency.start(); - Tensor output = add2d(exponent_out, input0, input1, &layer_activation_prelu); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } - return false; - } else { - Tensor input0_tmp; - input0_tmp.set_element((int8_t *)&input0_element[offset0]) - .set_exponent(exponent0) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1_tmp; - input1_tmp.set_element((int8_t *)&input1_element[offset1]) - .set_exponent(exponent1) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - Tensor input0; - input0 = input0_tmp; - input0.set_auto_free(true); - - Tensor input1; - input1 = input1_tmp; - input1.set_auto_free(true); - - // output; - if (activation_type == 0) { - Tensor output_c = add2d_c(exponent_out, input0, input1, (Activation *)NULL); - latency.start(); - add2d(exponent_out, input0, input1, (Activation *)NULL); //(Activation *)NULL - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 1) { // relu - Tensor output_c = add2d_c(exponent_out, input0, input1, &layer_activation_relu); - latency.start(); - add2d(exponent_out, input0, input1, &layer_activation_relu); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 2) { // leakyrelu - Tensor output_c = add2d_c(exponent_out, input0, input1, &layer_activation_lrelu); - latency.start(); - add2d(exponent_out, input0, input1, &layer_activation_lrelu); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 3) { // prelu - Tensor output_c = add2d_c(exponent_out, input0, input1, &layer_activation_prelu); - latency.start(); - add2d(exponent_out, input0, input1, &layer_activation_prelu); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } - return false; - } -} - -bool test_add_s16(int exponent0, - int exponent1, - int exponent_out, - int offset0, - int offset1, - int height, - int width, - int channel, - int activation_type, - bool inplace) -{ - // output; - if (!inplace) { - Tensor input0; - input0.set_element((int16_t *)&input0_element_s16[offset0]) - .set_exponent(exponent0) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1; - input1.set_element((int16_t *)&input1_element_s16[offset1]) - .set_exponent(exponent1) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - if (activation_type == 0) { - Tensor output_c = add2d_c(exponent_out, input0, input1, (Activation *)NULL); - latency.start(); - Tensor output = - add2d(exponent_out, input0, input1, (Activation *)NULL); //(Activation *)NULL - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 1, false); - } else if (activation_type == 1) { // relu - Tensor output_c = add2d_c(exponent_out, input0, input1, &layer_activation_relu_s16); - latency.start(); - Tensor output = add2d(exponent_out, input0, input1, &layer_activation_relu_s16); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 1, false); - } else if (activation_type == 2) { // leakyrelu - Tensor output_c = add2d_c(exponent_out, input0, input1, &layer_activation_lrelu_s16); - latency.start(); - Tensor output = add2d(exponent_out, input0, input1, &layer_activation_lrelu_s16); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 1, false); - } else if (activation_type == 3) { // prelu - Tensor output_c = add2d_c(exponent_out, input0, input1, &layer_activation_prelu_s16); - latency.start(); - Tensor output = add2d(exponent_out, input0, input1, &layer_activation_prelu_s16); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 1, false); - } - return false; - } else { - Tensor input0_tmp; - input0_tmp.set_element((int16_t *)&input0_element_s16[offset0]) - .set_exponent(exponent0) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1_tmp; - input1_tmp.set_element((int16_t *)&input1_element_s16[offset1]) - .set_exponent(exponent1) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - Tensor input0(input0_tmp, true); - input0.set_auto_free(true); - Tensor input1(input1_tmp, true); - input1.set_auto_free(true); - - if (activation_type == 0) { - Tensor output_c = add2d_c(exponent_out, input0, input1, (Activation *)NULL); - latency.start(); - add2d( - exponent_out, input0, input1, (Activation *)NULL); //(Activation *)NULL - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 1, false); - } else if (activation_type == 1) { // relu - Tensor output_c = add2d_c(exponent_out, input0, input1, &layer_activation_relu_s16); - latency.start(); - add2d(exponent_out, input0, input1, &layer_activation_relu_s16); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 1, false); - } else if (activation_type == 2) { // leakyrelu - Tensor output_c = add2d_c(exponent_out, input0, input1, &layer_activation_lrelu_s16); - latency.start(); - add2d(exponent_out, input0, input1, &layer_activation_lrelu_s16); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 1, false); - } else if (activation_type == 3) { // prelu - Tensor output_c = add2d_c(exponent_out, input0, input1, &layer_activation_prelu_s16); - latency.start(); - add2d(exponent_out, input0, input1, &layer_activation_prelu_s16); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 1, false); - } - return false; - } -} - -//------------------------------------------------------------------no -// inplace------------------------------------------------------------------ -// c = 6, lrelu -TEST_CASE("test no inplace start", "[add]") -{ - TEST_ASSERT(true); - printf("\n\n"); -} - -TEST_CASE("test_no_scale_s8, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -8, 4, 7, 3, 3, 6, 2, false)); -} - -TEST_CASE("test_scale_input_only_s8, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -10, -8, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_output_only_s8, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -10, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_shift_output_only_s8, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -7, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input0_output_shift_s8, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-10, -8, -7, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input1_output_scale_s8, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -10, -9, 4, 7, 5, 7, 6, 2, false)); -} - -// c = 16, prelu -TEST_CASE("test_no_scale_s8, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -8, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input_only_s8, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -10, -8, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_output_only_s8, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -10, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_shift_output_only_s8, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -7, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input0_output_shift_s8, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-10, -8, -7, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input1_output_scale_s8, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -10, -9, 0, 0, 5, 7, 16, 3, false)); -} - -// c = 35, relu -TEST_CASE("test_no_scale_s8, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -8, 0, 0, 5, 7, 35, 1, false)); -} - -TEST_CASE("test_scale_input_only_s8, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -10, -8, 0, 0, 5, 7, 35, 1, false)); -} - -TEST_CASE("test_scale_output_only_s8, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -10, 0, 0, 5, 7, 35, 1, false)); -} - -TEST_CASE("test_shift_output_only_s8, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -7, 0, 0, 5, 7, 35, 1, false)); -} - -TEST_CASE("test_scale_input0_output_shift_s8, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s8(-10, -8, -7, 0, 0, 5, 7, 35, 1, false)); -} - -TEST_CASE("test_scale_input1_output_scale_s8, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -10, -9, 0, 0, 5, 7, 35, 1, false)); -} - -// c = 6, lrelu -TEST_CASE("test_no_scale_s16, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -16, 0, 0, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input_only_s16, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -18, -16, 0, 0, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_output_only_s16, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -18, 0, 0, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_shift_output_only_s16, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -15, 0, 0, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input0_output_shift_s16, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-18, -16, -15, 0, 0, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input1_output_scale_s16, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -18, -17, 0, 0, 5, 7, 6, 2, false)); -} - -// c = 16, prelu -TEST_CASE("test_no_scale_s16, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -16, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input_only_s16, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -18, -16, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_output_only_s16, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -18, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_shift_output_only_s16, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -15, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input0_output_shift_s16, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-18, -16, -15, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input1_output_scale_s16, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -18, -17, 0, 0, 5, 7, 16, 3, false)); -} - -// c = 35, relu -TEST_CASE("test_no_scale_s16, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -16, 0, 0, 5, 7, 35, 1, false)); -} - -TEST_CASE("test_scale_input_only_s16, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -18, -16, 0, 0, 5, 7, 35, 1, false)); -} - -TEST_CASE("test_scale_output_only_s16, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -18, 0, 0, 5, 7, 35, 1, false)); -} - -TEST_CASE("test_shift_output_only_s16, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -15, 0, 0, 5, 7, 35, 1, false)); -} - -TEST_CASE("test_scale_input0_output_shift_s16, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s16(-18, -16, -15, 0, 0, 5, 7, 35, 1, false)); -} - -TEST_CASE("test_scale_input1_output_scale_s16, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -18, -17, 0, 0, 5, 7, 35, 1, false)); -} - -//------------------------------------------------------------------inplace------------------------------------------------------------------ -// c = 6, lrelu -TEST_CASE("test inplace start", "[add]") -{ - TEST_ASSERT(true); - printf("\n\n"); -} - -TEST_CASE("test_no_scale_s8, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -8, 4, 7, 3, 3, 6, 2, true)); -} - -TEST_CASE("test_scale_input_only_s8, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -10, -8, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_output_only_s8, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -10, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_shift_output_only_s8, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -7, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input0_output_shift_s8, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-10, -8, -7, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input1_output_scale_s8, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -10, -9, 4, 7, 5, 7, 6, 2, true)); -} - -// c = 16, prelu -TEST_CASE("test_no_scale_s8, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -8, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input_only_s8, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -10, -8, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_output_only_s8, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -10, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_shift_output_only_s8, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -7, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input0_output_shift_s8, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-10, -8, -7, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input1_output_scale_s8, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -10, -9, 0, 0, 5, 7, 16, 3, true)); -} - -// c = 35, relu -TEST_CASE("test_no_scale_s8, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -8, 0, 0, 5, 7, 35, 1, true)); -} - -TEST_CASE("test_scale_input_only_s8, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -10, -8, 0, 0, 5, 7, 35, 1, true)); -} - -TEST_CASE("test_scale_output_only_s8, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -10, 0, 0, 5, 7, 35, 1, true)); -} - -TEST_CASE("test_shift_output_only_s8, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -8, -7, 0, 0, 5, 7, 35, 1, true)); -} - -TEST_CASE("test_scale_input0_output_shift_s8, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s8(-10, -8, -7, 0, 0, 5, 7, 35, 1, true)); -} - -TEST_CASE("test_scale_input1_output_scale_s8, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s8(-8, -10, -9, 0, 0, 5, 7, 35, 1, true)); -} - -// c = 6, lrelu -TEST_CASE("test_no_scale_s16, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -16, 0, 0, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input_only_s16, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -18, -16, 0, 0, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_output_only_s16, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -18, 0, 0, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_shift_output_only_s16, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -15, 0, 0, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input0_output_shift_s16, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-18, -16, -15, 0, 0, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input1_output_scale_s16, c=6, lrelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -18, -17, 0, 0, 5, 7, 6, 2, true)); -} - -// c = 16, prelu -TEST_CASE("test_no_scale_s16, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -16, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input_only_s16, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -18, -16, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_output_only_s16, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -18, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_shift_output_only_s16, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -15, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input0_output_shift_s16, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-18, -16, -15, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input1_output_scale_s16, c=16, prelu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -18, -17, 0, 0, 5, 7, 16, 3, true)); -} - -// c = 35, relu -TEST_CASE("test_no_scale_s16, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -16, 0, 0, 5, 7, 35, 1, true)); -} - -TEST_CASE("test_scale_input_only_s16, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -18, -16, 0, 0, 5, 7, 35, 1, true)); -} - -TEST_CASE("test_scale_output_only_s16, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -18, 0, 0, 5, 7, 35, 1, true)); -} - -TEST_CASE("test_shift_output_only_s16, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -16, -15, 0, 0, 5, 7, 35, 1, true)); -} - -TEST_CASE("test_scale_input0_output_shift_s16, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s16(-18, -16, -15, 0, 0, 5, 7, 35, 1, true)); -} - -TEST_CASE("test_scale_input1_output_scale_s16, c=35, relu", "[add]") -{ - TEST_ASSERT(test_add_s16(-16, -18, -17, 0, 0, 5, 7, 35, 1, true)); -} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_data.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_data.hpp deleted file mode 100644 index 519c5052..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_data.hpp +++ /dev/null @@ -1,4126 +0,0 @@ -#pragma once - -#include "dl_constant.hpp" -#include -using namespace dl; - -const static __attribute__((aligned(16))) int8_t input0_element[] = { - // 16, 15, 62 - 71, -6, -78, -62, -119, -83, -55, -103, 57, -36, -113, 67, -53, 82, -39, -3, 64, 66, -60, - 67, -86, 105, 110, -122, -23, -115, -31, 39, -19, 22, -112, 124, -48, -35, -118, -6, -102, 115, - 40, 88, 8, 119, 72, -13, -70, 16, 14, 107, -13, -11, -123, 53, -22, 2, 78, 84, 63, - -50, 63, 108, 105, -15, 90, -9, 109, 57, 34, 3, -54, -99, -99, -30, 14, 30, 19, -120, - 80, 51, 59, 40, -14, -115, 45, 61, 95, 53, -73, 7, -127, -113, -124, -10, -29, 56, -10, - 97, -1, -75, 49, 46, -22, 68, 68, 75, -60, -25, 90, -92, -118, 45, -119, 15, -54, -76, - -76, 122, 6, -58, -67, 25, -82, -111, 105, -117, -125, 57, -86, 82, 64, 74, 20, -11, 83, - 26, -80, -110, 6, 71, 117, 77, 108, -93, 120, 31, -123, 110, -123, -89, -104, -89, -5, 31, - 64, -63, -79, -66, 37, 92, -25, 101, 100, 76, -16, -68, -89, -96, 14, 42, 102, 118, 11, - 15, 59, -19, -9, 19, -82, -9, -115, 79, -85, 54, -125, 58, 121, -107, -92, 106, -30, 114, - 91, 117, -45, 26, -2, 69, 41, 2, -21, 118, 61, -80, 30, 101, 49, -49, 54, -97, -61, - 78, -116, 69, 93, -87, 118, -126, 83, 3, 53, -21, 11, 42, 115, -98, -107, 126, -41, 93, - 89, -63, 28, 105, 97, -65, 45, 41, 41, 63, 101, 105, -1, 109, 21, -24, -74, 3, 46, - 81, 60, 84, 98, 106, 81, -56, 101, -88, 94, 52, 125, -70, 126, -123, 53, 91, -114, 45, - -110, -3, -75, -42, 45, -65, -69, -12, 3, 90, 123, 1, 30, 112, 10, 72, 8, 70, 65, - 34, 110, -83, -73, -55, 117, 13, 39, 102, -81, -112, 32, -33, 37, -56, 10, -109, -39, -115, - 108, 100, 23, -3, 31, -63, -84, 36, 73, -43, 57, 110, 91, -87, 33, -8, -3, -52, -122, - 54, -76, -80, 2, 26, -17, -95, -10, 34, 34, 47, 71, 119, -59, -35, 118, 120, 28, -86, - 22, -32, 84, 12, -48, 34, -96, -98, -70, 60, 44, -59, 53, -5, 104, -84, 64, -63, 38, - 118, 6, -85, -118, -34, -92, 63, -12, 97, -74, 102, 112, 70, -19, 106, -74, -44, -69, 59, - 91, 37, -82, -111, 69, 106, -88, -112, 120, 11, -112, 45, 0, 22, -11, 27, -120, -128, -54, - -100, 82, -63, -108, 57, 73, 101, 88, -94, 84, 74, -1, 97, -42, -122, 82, -72, -67, -28, - -97, 111, 16, 113, -121, -87, -53, 32, 24, -18, 39, -10, -107, 75, 62, 1, 11, 12, 56, - -61, -43, -23, 44, -33, -62, -5, 56, -57, -10, -37, 36, 7, -45, -12, 103, -43, -35, -92, - 103, -57, 41, 3, -114, -84, 45, 73, -66, 14, -40, 96, -12, 87, -44, -92, -38, 44, 29, - -18, -14, -118, 57, -17, -95, -52, 46, 43, 8, -99, 99, -7, 96, 46, 3, 68, 101, 40, - 93, 16, 25, 22, 109, 47, -25, 54, 22, 5, -71, -33, -117, 98, -107, 38, 53, -120, -78, - 58, -127, -92, 81, 35, -14, 53, -38, -97, -16, -3, 118, -40, 14, -24, 72, 37, 22, -104, - -84, -107, -97, 13, -42, 83, 94, -54, 20, -38, -116, 17, -22, -51, -65, -47, -86, -65, 125, - 61, -85, -47, 64, -2, 35, 66, 64, 49, 66, 53, -76, -41, 11, -27, -70, 27, 68, 97, - -74, 62, -101, 56, 124, 29, 6, -102, 97, -26, 32, 123, 52, -123, -35, -23, -121, -124, 42, - 7, 34, 13, 78, 68, 99, -110, -89, 18, -48, -36, -124, 28, -108, 117, -43, 22, -52, -124, - -126, 13, -80, 49, 75, -9, -70, 34, 22, -90, -41, -93, 32, -123, -60, 35, -92, 117, 16, - -20, 33, -58, 124, -58, 23, -125, -116, 69, 77, 45, 8, 114, -10, 34, -86, 21, 105, 34, - -106, -75, -111, 25, -99, 25, -119, -43, 82, -99, -122, 30, -35, 64, 85, 26, -109, -8, 16, - -63, 77, 40, -59, -79, -51, 104, -127, 34, -25, -113, -8, 57, 67, 80, 45, 118, -12, 89, - 55, -114, -19, -17, -48, -108, 32, 90, 47, 98, 5, -37, 87, -109, 44, -112, 18, 126, 35, - -39, 81, 88, 82, 77, 92, 15, 85, 17, 100, 84, -46, -44, -6, -110, 60, 38, -4, -83, - 33, 32, 35, -11, -53, -96, 120, 26, -60, -118, -50, -111, 111, 123, 41, 27, 113, -43, -62, - 5, -42, -97, 17, -104, 70, 107, -23, 56, -41, -35, 95, -118, -24, 125, -65, -13, 45, -77, - 96, -96, -63, 42, -29, 49, 6, 9, 125, 40, 36, 85, 11, 9, 58, -18, -122, 67, -13, - -78, 71, 68, 23, -94, 34, -86, 38, 75, -50, -17, 114, 10, 14, -93, -88, -22, 38, 108, - 30, 50, -123, -118, 59, 6, 86, 74, 15, 75, -78, 66, -28, 123, -70, -77, -72, 22, -108, - -32, 123, -108, -102, -114, 47, -65, 102, -123, 66, 90, -21, 124, -62, -19, 83, 121, -10, 70, - -117, -27, 84, 14, 47, 16, -85, 71, -18, 41, 26, -11, 13, -127, -32, -108, 12, -56, -22, - -42, 19, 86, 45, 30, 10, -85, -77, -80, 14, -90, -96, 96, -72, 17, -62, -76, 113, -36, - 4, 86, -74, 82, -54, -87, 74, 52, -74, -18, -127, 15, 5, 76, 47, 80, 125, 4, 35, - -56, -95, -23, 14, 34, -53, -45, 67, 117, 120, -31, -118, -28, -51, -96, -114, 105, 104, 103, - 110, 122, 78, -78, 103, 51, -51, 69, 5, 62, -54, -97, -65, -104, 112, -30, -46, 73, 51, - 14, 125, -23, 60, -46, -51, -27, -79, -74, 86, 58, 115, 87, -64, -67, 16, -54, -113, -88, - -40, -23, -64, 117, 72, -124, -6, 79, 110, 65, 83, 96, -54, -96, 30, 70, -77, 7, 2, - -59, 33, 118, 88, 9, 58, -63, 23, 126, 48, 126, 61, -29, 32, -16, -28, -104, -108, 1, - -50, -82, -19, 50, 61, -23, -124, -4, -48, 114, 69, -78, 27, 110, 60, 56, -127, 92, 62, - 19, -41, -89, 118, 91, 19, -103, 73, -1, 125, 11, -25, -66, 83, -9, 15, 92, -18, 52, - 65, -73, -84, 71, 102, 126, -30, 72, -11, -26, -62, 23, -19, 101, -50, 8, 32, 16, 50, - -81, 82, -127, 25, 105, -76, 83, -115, 8, 6, 31, -29, 43, -17, -126, -64, -111, -13, 125, - -94, -92, -12, 16, -61, 53, 19, 29, 72, 24, 12, -4, -23, 25, -53, -116, 31, 60, 63, - -72, -82, 124, -52, -44, 117, -46, 70, -42, 5, 113, -74, 32, 5, -90, -70, -4, -17, -67, - -67, -26, -121, -10, -15, -3, -117, 92, -120, 10, 119, 84, -5, -93, -32, -30, -98, -99, -108, - 76, -67, -6, 11, 122, 21, 12, 108, 38, -104, -114, -94, -8, 12, 44, -16, 3, -104, 17, - 95, 49, 46, 105, 7, 92, 125, 10, 22, 47, 125, -63, -108, -9, 115, -15, 10, -86, 65, - 69, -33, 109, 3, 77, 111, 21, -84, 53, 31, 5, -5, -94, -79, 17, -87, 60, -27, 37, - -56, 74, -126, 6, 86, 54, 68, -73, -33, 89, 34, 105, 14, -101, 7, -42, -21, 0, -107, - -47, -123, -44, 0, 119, 40, -118, -115, 42, 14, 111, 83, 64, -7, 36, -65, -113, 31, 94, - 47, -119, -101, 116, 46, 107, 118, 67, 100, 21, -77, -113, 66, -48, 6, 92, 28, -127, 96, - -26, 35, -9, -50, 53, 41, -126, 107, -67, 67, 45, -108, 124, -107, -12, 113, -19, 68, -6, - 126, -74, 25, 2, -35, 69, -16, 7, 48, -103, 79, 76, 95, -34, -98, -55, 8, 10, -40, - 48, -80, 66, -61, -14, 111, -94, 76, -66, -97, 51, 6, -114, -60, 71, 40, -74, -105, -72, - -3, -8, 64, -80, -65, 111, -62, 84, -114, -81, 89, -43, 107, 38, -53, -1, -18, -61, -84, - 85, -45, -58, -74, -25, -83, 113, 117, 119, 9, 118, -117, -90, -70, -65, 36, -80, 92, 111, - -120, -27, -88, -43, -36, -3, 95, 25, -29, 19, 23, -86, -67, 52, 10, -125, 120, -122, 34, - -64, -60, -109, 71, -56, 24, 4, 85, -122, -6, 82, 126, -16, -91, -92, 2, -64, -83, -4, - 59, 82, 102, -1, -53, 103, -34, -57, 126, -63, -34, -17, 57, 38, 78, -65, 115, 29, -106, - -40, 14, 72, -58, -74, 111, 75, 61, -105, 70, 115, -124, 64, 78, -82, -55, 62, -31, 33, - 27, -126, 120, 90, -59, -78, 86, 53, -78, -116, 117, -58, 59, -25, -69, -45, 29, -38, -68, - 58, -30, 58, -104, -1, 103, -121, 84, 30, -127, 86, 0, 68, -23, -24, -32, 70, -15, 9, - 43, -12, 9, -78, 18, -26, 82, 90, 3, -42, -101, -86, -32, -10, -94, 91, 2, 70, -7, - 76, 94, -128, -23, -24, -76, -97, -76, 126, -102, 54, -57, 17, 115, -73, 36, 12, -56, -104, - 93, -8, -122, -51, 92, 21, -59, -50, -30, 68, 92, -51, -15, 93, 29, 60, -41, 98, -24, - 117, -107, -24, 108, -99, 98, -20, 57, 64, 52, 60, 47, -7, -102, 110, 79, 124, 119, -60, - 8, 63, -10, -12, 35, 40, 40, -61, 103, -11, -125, -56, -81, -36, 102, -37, -82, 42, 80, - 14, -52, -127, -12, -121, -26, -42, -67, 31, 82, 120, -35, -5, 34, 19, -35, 95, -117, 85, - 74, 112, -123, -112, -93, -70, -109, -123, 99, -66, -127, -98, 84, 48, -94, 78, -44, -98, 124, - -72, 31, -93, 8, 28, -110, 9, 47, -17, 58, 64, 78, 64, 7, -25, -81, 70, 44, 0, - 68, 102, -40, -24, -116, 58, -57, -59, -67, -9, 50, 0, -6, 106, -2, -36, 4, 123, -111, - 80, 14, -89, -59, 49, -29, -56, 119, -64, -51, 125, 19, 36, 73, 89, 46, 72, -87, 46, - 120, -110, -52, 94, -108, -54, -4, 119, -70, 24, -112, -66, -58, -97, 114, 71, -105, -19, -76, - 98, -103, 75, 6, 100, -65, 56, 86, -49, 2, 68, 111, 15, 119, 24, -83, -92, -51, 50, - 126, -28, 84, 58, 42, 58, 47, 24, 95, -66, -124, -85, -53, 98, -108, 123, 107, -124, 27, - 12, -13, 80, 23, -102, 18, 100, -128, 14, -62, 23, 70, -108, 41, -6, 38, 39, -100, -97, - 59, -105, -85, -123, -35, -31, -46, -109, 1, -22, -38, 18, -127, 118, -46, -104, 4, 120, 45, - -67, -65, 24, 83, 99, 46, -5, -128, 19, -87, 52, 55, -66, -105, 93, 100, 85, 94, 102, - 83, -77, 19, -117, 87, 48, -76, 46, -56, 67, -71, 34, 110, -4, 46, 88, -81, 89, 94, - -83, -94, 57, 4, 91, 21, -45, 19, 87, 26, -97, 105, -28, -62, -20, 76, 48, 12, 81, - -26, 41, -25, 79, -17, -105, 2, 82, 0, 97, -95, 82, -123, 41, 102, -1, -98, -126, 80, - -33, 48, -96, -119, -113, 37, 105, -114, 116, -59, -31, 2, -16, -42, -76, 125, -30, -29, -90, - -63, -39, 126, -38, -11, -3, -78, -33, -31, 14, -7, 85, -99, 117, 25, 76, -58, 1, -70, - 33, -94, 102, 41, 86, 83, -98, 102, -76, -119, 92, 62, 104, -104, -32, -38, 113, -96, -16, - -57, -84, 103, 90, -15, 51, 72, 65, 112, -67, -18, 122, 4, -66, -13, -90, -93, 41, -35, - -115, 107, -49, -119, 66, -80, -108, -99, -47, -1, 104, -20, -124, -55, -27, -123, -30, -116, -5, - 72, -30, -75, -103, 55, -46, 87, 123, -62, -68, 26, 110, -26, -100, -4, -25, -14, 61, 87, - 79, 119, 41, -92, -119, -18, 68, 95, -110, 69, -94, -92, -105, -97, 94, -53, -119, -127, -95, - -107, 89, -23, 75, 17, 126, 126, -118, -59, -113, -82, -91, 55, -81, 113, 74, -21, 100, -101, - 24, 79, 114, 32, -29, 74, 84, 126, -62, 118, 8, -42, 7, 38, 92, 37, -118, -50, -39, - 118, 118, -30, -2, -73, -89, -21, 104, -51, -60, 109, 21, -112, 122, 8, -79, -95, 53, -57, - 126, 85, 79, 66, -83, -44, -16, -109, 13, 103, 68, -64, -122, 73, -82, 86, -37, -16, -57, - 5, -65, -105, -37, 33, 75, 99, 120, 59, -80, -5, 99, -123, -17, -40, 63, -111, -95, 97, - 72, -121, 8, -110, 6, 98, -62, -64, 43, -34, 5, -3, -30, 60, 121, 96, -51, -62, -43, - 115, 82, 1, -1, -10, -39, -89, -32, -63, -100, 89, -22, 117, -80, -30, -7, 7, 47, -6, - -119, 32, 44, 55, 106, -98, 18, 88, -32, 60, -83, 50, -89, -38, -76, 19, 35, 106, -35, - 73, -38, -29, -109, 78, -21, 29, 50, -18, -6, 6, 26, 99, -96, -86, -70, -95, 111, -117, - -54, 31, -107, -64, -15, 70, -93, -41, -128, 66, -105, -65, 70, -110, -71, -127, 92, -79, 5, - 30, 81, 67, -53, -1, -120, -65, 122, 77, -120, -51, 73, -109, 104, 105, 23, -102, -39, 69, - 55, 65, 4, -113, -110, 68, -17, 19, 108, 124, -37, -55, 11, -110, -128, 3, 29, 16, 62, - 91, 114, 55, 61, 87, 82, 18, 8, -33, 1, -15, -15, 4, -89, 10, 72, 64, -94, 71, - 99, 36, 37, 53, 113, 73, 53, -57, 25, 110, -48, -122, 6, -113, 3, -83, -33, 96, -25, - -72, 71, -125, 108, -26, -118, 49, 106, 2, 37, -60, -62, -75, -109, 91, 50, 15, -55, -2, - 67, -128, 103, 21, 20, 70, 35, -50, -43, 81, 25, -13, -122, 54, -50, -55, -25, -112, 96, - -112, 44, 74, 15, 124, -78, 114, -80, -126, -11, -77, -121, 21, -71, -1, -63, -124, -128, -41, - 10, 34, 118, 123, -38, -15, -67, 120, 34, 26, 19, -3, -4, -15, 39, -37, 54, 126, -2, - 113, -13, 108, 2, 9, 86, 30, -69, -124, 42, -82, -119, 14, 85, 78, -118, 114, -69, 73, - -23, 108, -108, 50, 58, 67, 81, -48, 86, -51, 125, 32, -4, 99, 74, -6, 98, 32, 72, - -16, 113, -47, -50, 31, -125, -89, -22, -14, -97, -104, 113, 31, 121, 33, 126, 22, -92, 36, - -125, 40, -111, 23, -119, -8, 60, -30, 4, 9, 82, 56, -91, -1, -23, 54, 84, 53, -126, - 2, 23, 104, 91, 43, -54, 108, 84, 24, 29, 125, -89, -36, 92, 34, -124, -24, -93, -120, - -36, -52, -102, 42, 92, -67, -43, -3, 70, 19, 64, 10, 112, -97, -38, -114, 54, -6, 52, - -27, -84, 118, 124, -71, 74, 99, 30, 124, -57, 123, 82, 59, 122, 9, -36, -108, -31, -88, - 105, 21, 99, -100, -100, -107, 75, 22, -62, 80, -35, 89, -83, -117, 80, -74, -61, 16, -21, - -91, 88, 107, -79, -92, -46, -105, -61, -77, -46, -82, -127, 36, 77, 96, -88, 100, -48, 11, - 74, 29, -35, -37, 89, -118, -121, -17, -26, -8, 20, 74, -42, -15, 50, 60, -46, -82, -119, - 21, 43, 107, -84, 105, 40, -67, 47, -45, -13, 17, 72, 93, 79, -97, 66, 22, -121, 29, - 59, 11, -50, -109, -88, 111, -35, 46, -116, 103, 90, -84, -105, 59, -126, 28, 66, 46, 7, - -7, 8, -98, 111, -37, -85, -24, -71, 1, 73, -41, 88, 90, 109, 65, -49, -49, -107, -119, - -86, -119, -58, 17, 100, -50, -90, -123, 30, -105, 25, 40, 111, 104, -5, 106, -70, 78, 91, - -28, 98, -44, -49, -1, 15, 88, -105, -52, 120, -50, 29, -11, -121, -113, 53, -78, 42, -97, - -64, -105, 75, 42, -33, -107, -23, 102, -67, 37, 92, -121, -90, -92, -46, -119, 122, 30, 116, - 77, -111, 111, 46, -72, 36, 63, 46, 75, -36, -2, 5, -2, -33, 78, 37, 90, -61, 117, - -120, -80, 114, -29, 126, -94, 83, -57, 98, 46, -124, 35, 79, -43, -63, -60, 102, 49, -14, - 69, -39, -110, -75, -114, 108, -114, 93, 14, 122, -18, 18, -46, 32, -74, 108, 85, -93, 121, - -19, -63, -4, 116, 67, 83, 96, 35, -126, -122, 12, 106, 95, -100, 77, 6, -79, -93, 66, - -115, -64, 5, -66, -128, -18, -27, -67, -119, -109, -75, -105, 58, 36, 113, 39, -107, -23, -19, - 64, 18, 25, -28, -68, 31, -52, 71, -82, 124, 10, 34, -82, 21, -104, 58, -44, 5, -28, - 27, -71, 118, 30, -48, 15, 27, 35, 97, 61, 65, 9, 55, -119, -99, -114, -97, 85, -64, - -109, -55, 53, 76, -30, -107, 30, 88, 29, 9, 59, -14, -62, 106, 50, -16, -93, -105, -121, - 7, 42, -50, 18, -52, 120, -81, 35, 44, -60, 8, -69, 59, -125, -119, -17, 15, -88, 54, - 77, -10, -9, 6, 21, -45, -85, 59, -7, 110, 15, -96, 109, -16, 87, 58, -108, -67, 10, - -7, 77, 11, 76, -36, 74, -72, 49, -106, 112, -125, -75, 0, 20, 126, -87, -34, -90, -124, - 38, 65, -77, 57, -82, 11, 2, 94, -124, -50, 48, 15, 124, -20, 6, -69, 81, -33, -21, - -65, 97, 95, -33, 72, -67, -1, 8, 91, 36, -26, -21, -29, 7, -16, 27, 70, -127, 27, - -78, -9, 77, -95, 52, -77, 77, -21, -121, -56, 53, 39, 36, 110, -101, -109, 11, 20, 40, - 66, -72, -53, -39, -32, -64, 37, 79, 108, -107, -109, 98, -76, -11, 84, 16, -98, -107, 17, - -99, -73, 35, -68, -118, 50, 15, 123, -4, -26, 26, -29, -69, -105, 106, 53, 74, 99, -99, - 43, -34, 24, 93, 119, 7, -4, 67, -22, 77, 101, -14, -88, -96, 48, -75, -64, -39, 6, - -43, -36, 39, -108, -69, -117, -29, -79, 43, -13, 37, 2, 83, -103, 24, 5, 55, -9, -113, - -105, 100, -107, -89, -114, 35, -43, -45, 36, 3, 47, 37, 114, -39, 78, 83, 4, 109, -69, - 66, -8, -90, 3, 19, -26, -91, 61, 51, -7, -51, 84, -1, -34, -60, 58, 49, -75, 59, - -51, -77, -110, 36, 47, -103, -74, -98, 8, 88, -4, -80, -22, 116, -82, -83, 59, 61, -37, - 21, 45, 106, -36, 123, 20, 14, -108, 78, -32, -53, -41, -95, -81, -6, 84, 113, -68, 29, - -31, 27, 26, -44, -12, -88, -15, 80, -120, 49, -51, -44, -12, 109, -39, -21, 30, 57, -30, - -10, -40, -59, -126, 25, 14, 28, -26, -42, 9, -123, 18, 16, 63, 58, -5, -27, 94, 88, - -27, -31, 93, -32, -73, -76, 79, -63, -56, 24, -122, -31, -125, -127, -15, -96, 82, -124, -22, - 9, -115, 4, -23, 79, 45, -3, 57, -40, 121, -125, -103, -21, -101, -12, -65, -53, -128, -114, - -32, 37, -84, -2, 76, 121, 13, -119, 117, 54, -124, 41, 1, -59, 77, 25, 63, 49, -56, - 4, -9, -120, -11, -109, -80, 79, 98, 9, -17, 7, -70, 0, -120, 117, -19, -102, -39, 89, - 54, -86, 123, 2, -44, -85, -66, -56, -124, 44, -109, -15, -52, -58, -7, -79, 2, -34, 38, - 41, 54, -78, -116, -108, -7, -43, -33, -64, 115, -109, -52, -82, -22, -57, 40, 119, -80, -103, - 55, 75, -86, -62, 125, 4, -1, 108, -56, -71, 89, 91, -7, -2, 47, 90, 95, 47, 9, - -125, 22, -3, -56, 83, 47, 36, 82, -101, -122, 57, -41, 30, 48, -43, 67, -27, -95, 50, - -65, 40, 91, -100, -42, -103, 73, 24, 62, -18, -83, 94, -110, -46, -19, -21, 59, -119, -83, - 89, 8, 105, 82, 99, 31, -86, 99, -84, -19, -14, -2, 108, 105, -20, 14, 82, 98, -39, - -51, 67, -113, -2, 94, -65, -11, -12, 103, -118, -107, 45, -101, -57, 1, 82, 126, 56, -69, - -112, 41, 12, -47, -97, -74, 96, -80, 121, -74, 14, -64, -64, 28, -114, 93, 113, 122, -7, - 94, -106, 112, -53, -65, -6, -121, 48, -19, -16, -56, -25, -107, 54, -24, -11, 63, 29, -51, - 124, -96, -46, -109, -48, -35, -105, -75, -64, -89, -92, -118, 25, 10, 121, 36, 77, 102, -113, - 58, -106, 117, 72, 15, 94, -99, 79, -41, -111, -114, -76, -6, 21, -65, 38, -46, 45, 52, - 77, 6, 110, -119, -117, -78, 105, 118, 55, -19, 26, 92, 86, -26, 86, -23, 41, 29, -74, - 13, 33, -14, -8, 23, 56, 84, -95, 72, 65, 120, -115, 98, -36, -17, -92, -118, -34, 92, - 1, -24, -110, 56, -76, -113, -116, -110, -122, 60, -63, 55, 114, 110, 31, 19, -107, -90, -97, - 82, 55, 19, -108, 17, 126, -43, 123, -18, -27, -123, 106, 41, -98, 50, 6, 97, -51, -78, - -112, -61, -46, -59, -42, -5, -75, -127, 10, 67, 18, -75, 101, -80, -69, -2, -116, -3, 35, - 53, 92, -25, -11, 117, 47, 14, -66, -92, 101, 78, -25, 57, 78, 44, 10, 11, 69, 70, - 92, 56, 67, -88, 17, -39, 107, -64, -56, 99, 19, 86, -4, -44, -47, 73, -70, -107, -109, - -42, -34, 19, -46, 70, -41, 4, 10, -83, 78, -2, 22, -62, -18, 57, -29, -118, 119, -82, - -21, -125, 40, 95, 23, 3, -80, 123, 23, 13, -23, -90, -122, 91, -36, -118, 63, -20, 3, - -122, 14, -113, 54, -106, -22, 116, 111, -86, -60, 76, 25, 33, 18, 106, 116, -59, 96, 53, - 21, -112, -23, 78, -92, -16, 125, -35, -31, 119, -39, -60, 66, 90, 98, -89, 55, -108, -63, - 15, -71, 51, 114, -39, -95, -121, -106, -106, -43, -37, -75, -57, -23, -118, -45, -30, -86, -75, - -76, 116, -75, -84, -5, -96, 44, -91, 8, 47, -52, 32, -45, 96, 59, -119, 95, -2, -119, - -45, -19, -117, -64, -90, 67, -82, 51, 23, 75, -70, 10, -76, 16, 39, 34, 48, 100, -118, - -41, -69, 76, 53, -128, -53, -82, -82, -106, 38, -72, -57, 59, 113, -107, -114, -77, 14, -26, - 7, -72, -20, 121, -39, 87, -53, 62, -70, 12, 53, -40, -92, 51, 60, -55, 41, 54, 98, - -28, -76, 124, 102, 30, 119, 100, -22, 109, 90, -121, 43, -102, -90, -68, -3, -83, -63, 107, - -55, 25, -72, -24, 29, 97, -6, -99, 18, -40, -19, -76, -104, -11, -97, 89, 101, 61, -7, - 68, 47, 97, -91, -8, -44, -25, -44, -83, 100, -107, 72, 64, -22, -25, -103, -107, -85, -70, - 115, 48, -55, -5, -71, 114, -34, 32, 75, -7, 21, 123, 120, 10, 73, -40, 51, -93, 2, - -121, -39, 81, -33, 36, -62, -55, -55, -104, 70, 66, -17, 72, -36, 122, -90, 111, -91, -9, - -80, -98, 116, -37, 91, 102, 86, 58, -84, 83, -61, -124, -123, 10, 62, -121, -15, 107, 85, - 113, -17, -61, 69, 74, 2, -83, -33, -121, 105, -53, 64, 65, -61, 72, 78, -103, 105, 34, - 77, 80, 97, 30, -43, 84, -89, -91, -3, 23, -69, -67, -72, 124, 28, -9, -75, -26, 70, - -108, -8, -40, 39, -2, 11, -49, -68, 89, -35, -110, 41, -110, -54, -82, 48, 58, -28, 110, - -61, -117, 22, -22, -81, 77, 56, -55, -106, 9, 101, -122, -42, 58, 64, -52, -100, -58, 3, - 24, 104, 39, 31, -21, 43, -34, -108, -13, 36, -39, 123, -24, 110, 93, -94, -70, -1, -85, - 121, -74, -99, -30, 98, -69, 78, 19, -22, -74, -56, 7, -43, 111, 90, 4, 114, -18, -2, - -70, -95, 93, -89, -55, 51, -97, 35, 43, 63, 72, -82, 108, 104, -38, 36, 42, 83, 31, - -48, 39, -41, -6, 74, -106, -55, -66, 66, -25, -99, -24, -99, -31, 49, 114, -47, -22, -78, - -2, -73, -100, -94, 89, 23, -112, -5, 53, -101, 20, -35, 39, 41, -65, -29, 99, 88, 7, - -15, 100, -76, 18, 114, -14, 100, -111, -103, -104, 119, 52, -79, -46, -99, -46, 72, 121, -23, - 95, -114, -88, -45, -10, -102, -4, 105, 25, -43, 100, -99, -45, 94, 78, -14, -30, -40, 65, - 27, 36, 55, 33, 79, 48, -13, 68, -8, -55, 122, 16, 35, 98, -63, -94, 97, 12, -78, - -58, 116, -64, -107, -124, -59, 49, -115, 104, -121, -107, -79, -68, -67, -68, -14, 45, 95, 100, - -17, 32, -47, 41, -40, -91, -106, -18, 51, -21, -126, 125, 16, -38, 51, -114, 89, -55, 100, - 73, 105, -36, -115, 119, 54, -8, 55, -76, -46, 74, -83, 56, -29, 106, -63, -50, 83, 42, - -88, 101, -50, 32, 107, 49, -123, 64, -28, -120, 94, 123, 16, -27, -44, 75, 21, -77, -6, - 35, -118, -70, 16, 98, -64, -47, 110, -114, 18, -40, -63, 115, -20, 1, 82, -93, -11, 2, - -52, -55, 39, -114, 34, -59, -105, 113, 16, -8, -45, 75, 85, -70, 123, -120, 87, 90, -46, - -36, 0, 114, 42, 122, 105, 61, -7, 54, -119, 7, -21, -123, 107, -27, 11, -101, 118, -48, - -68, -34, 54, -39, -86, -7, -90, 109, 40, -121, -20, -2, 91, -76, 51, 30, -36, -117, 75, - -114, 73, 57, 83, -39, -50, 21, 77, 45, 31, -10, -36, -33, 108, -50, 19, -93, -19, -66, - -70, 32, -72, 37, -112, 74, -51, -100, -66, -13, 38, 41, -55, -36, -28, 87, 53, -95, -118, - -127, -111, 120, -110, 5, 23, -71, 120, 14, 20, -64, 77, 80, -40, 99, 123, 7, -64, 120, - 111, -59, 97, 1, -71, 100, -41, 118, -81, 108, -102, -112, -77, -20, 44, 33, -56, -76, -41, - 21, -23, 14, -9, 80, -80, -25, 19, 66, 78, -61, 114, -36, 16, -62, -77, 27, 49, 122, - -86, 114, 32, 26, -26, 43, 113, 5, -77, 73, 100, -108, -74, 50, 53, 96, -2, -109, -48, - 71, 73, 2, -41, 117, 105, -69, 67, -122, -32, -105, -73, 31, -25, 52, 101, -30, 63, -63, - 19, 48, 22, -1, -41, -81, 101, -80, 123, -88, -67, 38, 26, -74, 122, -86, -11, -105, 125, - 38, -4, 33, 9, -74, -57, 66, 27, 92, -101, -58, 76, -80, -81, 30, -125, 53, 86, -70, - 81, 115, 17, 2, 118, 104, -113, -102, 57, 120, -52, 0, 22, -66, -81, -70, 79, 68, 89, - 112, -88, -99, -66, -43, 19, 2, -24, -67, -75, 54, -11, -68, -42, -25, 90, 123, 28, -91, - -13, -67, 92, -94, -67, -28, -11, -79, -53, -51, 110, 10, 67, 20, 86, -24, 126, -97, -120, - -76, -45, 81, 4, -66, -30, -50, 111, -30, 5, 67, -19, -96, 121, -25, -114, 93, 81, -91, - -84, -86, -113, 1, 108, -53, 80, 59, 56, 97, -79, -17, 71, -26, -105, 94, 25, -69, -75, - 103, 34, 38, 27, -100, 93, -59, 4, -108, 65, 123, 108, -124, 37, -103, 69, 92, -3, 62, - -70, 82, 118, 37, -83, 119, 123, -97, -68, 125, 106, 21, -45, -36, -71, -87, -5, 119, 106, - 84, -23, 44, -85, -113, 77, 66, -66, 105, 104, 87, 118, 109, -99, -49, 122, 123, 13, -32, - 18, 94, -26, -86, -82, -111, -43, 110, -66, -121, -22, 116, -32, 120, 123, -83, 105, -115, 16, - -89, -47, 102, -35, -19, 37, -89, 61, -106, 111, 84, -62, 46, 54, 118, -102, 41, 6, -65, - -44, 113, 58, 74, -68, -28, 22, -108, 119, -94, -4, -11, -125, -26, 68, -7, -53, -78, 81, - 43, 27, -19, -86, -107, 98, 46, 62, 49, -107, 15, 26, 14, -29, 92, 125, -80, -114, -113, - -61, 19, -107, 18, -41, -37, -59, -19, -9, -29, 10, 71, -36, -93, 30, 73, -52, -29, 14, - -79, -3, 113, -92, 0, -44, -1, -55, 2, -73, -28, 70, 9, -15, 4, 68, -60, -65, -47, - -42, -103, -11, -56, -91, 84, 34, -66, -61, 52, 97, -11, 68, -65, 86, -63, -72, -58, -14, - 81, 78, -59, 105, -40, 72, 53, -2, 86, -102, 118, 70, -102, 84, -4, 125, -67, -94, -92, - 108, -18, 87, -99, 2, 40, 51, 37, -115, 118, -7, 58, 90, -70, 121, -26, 74, 79, -82, - 43, 6, 90, -57, -30, 6, -125, 67, 79, -88, 81, 66, -95, -6, 120, 119, 101, -98, -11, - 20, -84, -78, -50, 120, 0, -41, 59, -1, -31, 48, 51, -2, -87, -66, 27, 114, -112, 67, - -41, 57, 41, 6, -100, 42, -33, 46, 86, 123, -108, 86, 90, 27, -76, 19, 40, -61, 33, - -112, 63, -128, -43, -10, 11, -80, -4, 35, -106, -17, 85, -12, 98, -42, -115, -57, -10, -34, - 39, 81, -84, -113, -3, -128, -101, -21, 24, 62, 125, -40, -95, 51, 79, -23, 99, 17, -58, - 14, -109, 37, 107, -25, -66, -45, -50, -126, -65, -83, -33, 122, 30, -33, 105, 47, -86, -71, - -17, 98, 66, -28, 15, -13, 38, -11, -49, 36, -128, 126, -42, 69, 122, -84, 28, -121, -108, - -22, 96, -2, -16, 72, 110, 19, -97, 15, 76, 53, -24, -82, -104, -34, 7, -124, 49, 30, - -111, -41, -92, 125, 21, -106, -92, 24, -43, 66, 117, -54, -68, -22, -101, -60, -52, 91, -52, - 90, -83, -93, 39, -84, -51, -13, 41, 18, -45, -38, -23, -67, 97, -35, 116, 105, -26, 44, - -5, 96, 43, -16, 62, -51, 88, 5, 59, -98, 57, -108, 106, -11, 5, -18, -21, 70, 111, - -125, -99, 32, -85, -23, -64, 5, 121, 1, -57, 37, 18, 104, -126, -21, -115, 103, -7, 30, - 91, -121, -114, 19, -11, 115, -18, 94, -110, -78, 99, -10, 5, -98, 67, 44, -110, 57, -71, - -39, -103, -28, 108, -29, -25, 80, -97, 126, -93, 32, -69, -71, 38, -7, -74, 24, -10, 29, - 83, 62, 92, 65, 14, -126, -13, 120, 102, 64, -84, 24, 100, 118, 76, 88, -101, -116, -48, - 99, -62, -124, 1, 112, 28, -79, -52, 3, 64, -73, 25, -28, -79, 8, 6, 22, -13, 76, - -116, -101, 64, -16, -49, -106, -100, -33, -5, -87, -64, 18, 65, 100, 90, 4, -78, 47, -43, - -118, 70, 114, -10, 17, 87, -122, -33, -107, -124, 43, -10, -87, 83, 47, -3, 29, 89, -9, - -77, -103, 123, 33, 60, 0, 0, -6, -92, 58, 116, -59, -80, 30, 0, -91, 84, 81, -37, - 112, -64, -73, -127, 66, 10, 22, 25, 56, -109, -92, -25, -40, 86, -49, 56, -16, -12, -66, - -38, -56, 83, -111, -1, 89, -84, 98, 113, 66, -61, -87, 86, -52, 63, 4, 18, -18, -84, - -40, -11, -115, 105, 118, -98, -107, 104, -87, 84, 48, 23, 62, -59, 98, 25, -31, -44, 120, - -26, 63, 22, -58, 108, -5, 46, 84, -70, -20, -30, 115, -103, -70, -86, -42, 8, -79, 111, - 121, -20, 88, -103, -106, 37, -80, 29, 40, -116, -128, 1, 112, 63, -40, 62, 105, 119, -13, - 119, 52, -8, 82, -48, 0, 70, -105, 125, -17, -78, 116, -71, 39, 24, 73, 2, -45, -105, - -104, -112, 102, -31, -56, 125, -93, 26, 54, 121, 100, -53, -59, 15, 2, -47, -96, -53, -48, - -13, 43, -18, -90, -18, -10, 75, 96, -126, -115, 34, -23, -78, -34, -80, 39, -103, -26, -33, - 21, 55, 77, -95, -120, 53, -63, 31, -40, 75, -127, -46, 49, 5, 45, 2, -112, -29, 27, - -34, -73, 40, -89, -99, -1, -54, -100, 58, 99, 52, -94, -108, -46, 86, 110, -76, -8, 55, - -19, -47, -49, -97, 78, 95, -16, -54, -8, 18, -83, 19, -43, 113, -16, -118, -53, -95, -40, - -28, -31, 90, 24, -115, -86, -119, 124, 78, 10, 62, -2, 100, 102, 85, -30, -10, -62, 87, - -10, -124, 117, 28, -90, 33, -56, -74, 41, -34, -43, 9, 48, -126, 121, -111, -32, -105, -41, - 60, -103, -62, -3, 112, -45, 119, -3, 24, 12, 113, -97, -25, -17, 53, -98, -99, 117, -6, - -19, -70, -62, 61, 5, 88, -105, 93, -98, 115, -66, -27, -56, -115, -73, -19, -36, 103, 101, - 16, -22, -16, 103, -18, -109, -31, 43, -91, -66, 54, -20, 50, -118, 102, 10, 36, -58, -115, - 118, -3, 39, 79, 74, 19, 90, -9, 11, 3, 88, -31, -3, 91, 90, -15, -108, 28, 23, - -84, -44, -31, 4, -83, 91, -87, -72, -111, 31, 90, 45, 22, 106, -82, 77, 80, -68, 49, - -12, 26, 114, -90, -24, -109, -125, 82, -114, 8, 44, 117, 125, 63, -109, -38, 84, -32, -112, - 68, -84, -122, -64, -100, -36, -117, 109, -108, 3, -2, 45, -44, 30, 46, 71, 27, -11, 5, - 63, -44, 85, 20, 125, 108, 114, 47, -95, -109, 59, 32, -19, -58, -117, 57, 91, 72, 68, - -25, -120, -75, 45, 8, 34, 79, 27, 49, -59, 47, 106, -58, 18, 28, -57, 103, 85, -44, - 110, 27, -115, 7, 107, 125, -5, -111, 16, 115, -93, 57, -11, -18, 44, -116, -75, -6, 29, - -121, -32, 70, 91, 47, -50, -41, -125, 51, -12, -42, -77, 44, -36, 1, 33, 48, -122, 65, - -100, 93, -33, -125, 79, -96, 38, 54, -30, -112, -104, 37, 68, 18, 54, 23, -58, 57, 44, - 7, 34, 19, 67, -31, 103, 96, -40, 52, 55, 50, -63, -35, 124, 3, 111, 30, -124, -34, - 6, -82, -42, -30, 114, 104, -105, 104, -106, 22, 77, -24, 0, 122, -42, -12, -69, 34, 66, - 14, 101, 24, 49, 35, -51, 88, 15, -94, 69, -108, -5, -13, 33, 13, -121, 109, 36, 12, - -9, -5, 37, -73, 114, -29, 36, 2, 7, -30, 40, -26, -76, -128, 21, 100, 28, -22, -105, - 16, 42, 19, -49, 42, 30, 114, -110, -117, -8, -3, -89, 27, 61, -87, -113, -41, -99, 17, - 71, 111, 61, -120, 11, -128, -48, 55, 3, -83, -117, -48, -8, 101, 83, 78, 9, -21, 102, - -93, 6, -79, -42, 99, 7, 18, 106, -128, -61, -86, 46, -75, -98, 96, 15, -65, -39, -124, - -118, 37, -16, -5, -26, 115, 108, 106, -69, 0, -34, -115, -94, 18, 122, 42, 9, -128, -22, - -80, -76, 111, 64, -13, 16, 4, -93, -32, -45, -58, -100, 113, -78, 20, 11, -87, 92, 78, - 102, -49, -88, -82, -25, 76, 110, 45, 32, -17, 40, -122, -43, 98, -18, -36, -30, -83, 80, - -90, 49, -85, 97, -113, 26, 94, -38, -88, -39, 107, -118, -25, -68, 13, -60, 91, 101, 123, - 29, 17, -83, 23, -91, 72, 62, 79, -113, 82, -124, 103, 93, -42, 8, -29, 98, 106, 101, - -41, -99, -44, -71, 14, 112, 2, 1, -110, -119, 0, -52, -34, 122, 15, -6, -126, -65, -45, - -122, 58, 49, 79, 53, -93, 121, 53, -97, 41, 81, -13, 19, 20, 82, -81, -57, -102, 49, - -12, 80, 120, 37, -91, 26, -118, 49, -77, -52, 122, -11, 5, -113, 32, -115, -114, -115, -24, - -43, -26, 115, 14, 111, 99, 8, 117, 99, 72, 47, -74, 43, 43, 113, 5, -112, 34, 121, - -84, -78, -108, -40, -13, 46, 84, 12, 34, -84, -89, -107, -113, -1, -15, -83, 85, -54, 88, - -122, -95, -17, 56, 67, 59, 30, 95, 37, -122, 105, 79, 123, 83, -85, -94, -128, -128, 107, - 62, -70, -91, -74, 55, 121, -48, -18, -70, -78, 41, -56, -30, 69, -111, 106, 34, 114, 111, - -29, 37, 118, -46, 24, -95, 31, 53, -10, -112, -99, 32, -7, 74, -94, 27, -53, 6, 42, - 52, 68, -24, -76, 58, 26, -114, -23, -107, -35, 93, 106, -28, 94, -93, 10, 80, 70, 75, - -115, 23, -17, -33, 72, -107, -47, -49, -65, 59, -51, -120, -83, 73, -80, 48, 37, 26, 89, - 28, 103, 97, -110, 61, -40, 2, -55, -38, 99, -117, 109, 105, 108, -60, -126, -49, 76, 38, - -88, -16, 39, -60, 103, 8, 16, -51, -18, 126, 69, -18, -100, 91, -75, -86, 86, -107, 27, - 88, 70, 22, 4, 119, 80, -73, -20, 49, -58, -119, 120, -15, -25, 47, 46, 24, -109, 42, - -48, 101, 108, 32, 18, 109, 85, 76, -30, 30, -99, -98, 96, -92, -36, -81, 22, -1, 54, - 77, -88, -33, -84, -126, -5, 36, -15, -64, 77, -70, -27, 21, 23, -105, 18, 88, -127, 63, - -37, -104, -36, -60, 90, 69, -122, 21, 62, -127, 62, 5, -22, 55, -116, -10, -7, -96, 4, - -82, 71, 49, 49, 30, 62, 18, 46, -12, -52, -91, 92, -85, 114, -54, -68, -123, -24, 33, - -40, 79, 62, -123, -63, 76, 54, 57, -121, 120, -69, 109, 49, 1, 53, 116, -79, -101, -106, - 50, 77, -38, -84, 114, -121, -122, -60, -59, 40, 67, 17, -63, 11, 40, 123, 84, -41, 62, - 28, 46, 73, 117, 25, -115, 124, -109, -35, -123, -74, 25, -108, -84, 72, 56, 82, -52, 19, - -44, -25, 62, 29, 94, -73, 94, 93, 99, 113, -72, -77, 55, 68, 31, 95, -100, 59, -87, - 98, 45, -23, 122, 64, 66, -47, 94, 82, -20, 102, 55, -107, 37, -98, -14, 46, 5, -7, - 85, 22, -30, -98, -113, -22, 34, 109, 59, -76, 87, 60, -107, 120, -111, -55, -35, -124, -56, - 53, 23, -1, -87, 39, 70, 17, -98, -7, -65, -108, -78, 6, -6, 21, -33, 109, 104, -53, - -46, -86, -128, 96, -40, -89, 87, 26, -69, 94, 100, 34, 28, -113, -61, 1, -121, 69, 39, - -106, -20, 41, 72, 92, -6, -25, 100, -83, -23, -103, -49, 23, -93, -79, 118, 91, 113, 44, - -118, -25, -39, -75, -106, -95, -38, 99, 33, -34, -115, 34, -13, -53, -50, 125, -101, -25, -115, - -110, 12, 48, 67, -29, -75, -16, 73, -115, -122, -78, -20, 85, -125, 89, 97, -68, -63, -15, - -70, 92, -22, -22, -108, -39, -48, 26, -18, -7, 75, 10, -23, -27, 64, -79, 51, -112, -21, - -113, -14, 61, 108, 41, -25, 58, 34, 115, 97, 45, 14, 7, -45, -8, -53, -97, 63, -122, - -42, -77, 23, -124, 46, -81, -116, 106, -107, -6, 64, 51, 55, 78, -107, -4, -94, 68, -36, - 68, -74, -113, -112, -36, 80, 110, 117, -28, 34, 117, -22, 16, -64, 28, 11, -108, 21, 78, - -76, 31, -80, 25, 2, 10, 49, -88, -59, -6, 39, -101, -96, 46, 9, 7, 23, 44, 64, - -103, -47, -95, -99, 56, -45, -127, -36, 125, -33, 31, -41, -60, -15, 94, -25, -122, -26, -108, - 109, -106, -2, -108, 63, 21, -48, -98, -15, -112, 21, 64, 111, -38, -24, 76, -34, 5, 27, - 10, 47, 57, -46, -54, 54, -27, 93, 115, -43, 116, -36, 75, -68, -115, -22, 114, 61, 72, - -20, 78, 52, 120, 57, 125, 60, 110, 59, 68, -47, 52, 97, -31, -102, -71, 10, 17, 100, - -75, -125, 19, -93, 56, 61, -41, -30, -116, -17, -49, -90, -23, -90, 12, 76, 65, -66, 97, - 28, -22, 31, 90, -77, -116, 80, 99, -108, -93, -31, -84, 79, 58, 86, 60, -101, -75, -116, - -49, 124, -90, -33, 32, 73, 93, 119, -116, -58, -95, -30, 11, -66, -13, -128, 92, -28, -26, - 50, -89, 23, -33, -23, 110, -9, -35, -1, -30, -102, 39, -51, -64, -83, -90, -90, 12, -32, - 32, 50, 122, -32, -6, -53, 40, -66, -19, 19, 47, -91, 20, 85, -96, 115, 84, -127, 3, - -48, -36, 36, -65, 85, -3, -71, 86, -9, -123, -77, 4, 103, -91, -84, 112, -35, 97, -7, - -91, 12, -12, 107, -54, 39, -110, 13, 70, 50, -94, -59, 82, -24, 106, 8, 64, -108, -124, - -54, -76, -13, 35, 107, 68, 100, -43, 0, -51, 117, -21, -110, 120, -91, 40, 64, -98, -29, - -95, -67, -57, 40, -9, -54, -35, -108, 69, -109, -67, -123, 75, 37, -113, -85, -121, -26, 93, - 91, -54, -105, 25, 44, 58, 9, 70, -9, 14, 88, 117, 72, -102, -107, 121, 115, 20, 94, - -84, -69, 15, 29, 46, 50, -15, 76, 28, -44, -5, 40, 44, 121, 52, 96, 103, -65, 72, - -78, -117, 113, -42, -18, 8, 90, -1, 110, 46, 39, 61, 111, 102, -38, -88, -104, -113, 48, - 80, -116, 65, 49, 106, 28, 119, -96, 111, 86, -128, -82, 62, -125, 25, -32, 9, -47, 88, - 98, 38, -117, 48, -77, -12, 104, -110, -90, 64, 102, 44, -28, 122, -79, -29, -114, -73, 99, - -94, -29, 54, 16, 91, 4, -121, -88, 119, 51, -27, -110, -65, 32, 55, -105, -79, -101, 121, - 90, -46, -53, -126, -76, -73, -22, 76, -1, 30, 74, -100, 107, 37, 88, 89, -3, 45, -118, - 81, -6, 123, -54, -90, 73, 24, -93, 10, -22, -6, -23, 47, 82, 39, -100, -3, 34, 12, - -49, -96, -23, 7, 109, -98, -99, -45, -4, -57, -108, 96, 18, 83, -25, 40, -51, 31, 30, - -24, -29, -45, 118, 74, -19, -117, -60, -116, 114, -80, 91, 2, -3, 86, -82, -57, -15, -29, - -119, -68, -59, 81, -53, 44, -97, -20, -38, -96, -46, 103, 39, 30, -75, -92, -104, 73, -126, - 70, -50, -93, -15, -34, 43, 74, -70, 27, -81, -91, 0, -20, 21, 34, 7, 110, 47, -19, - 124, -8, -58, -122, 94, -69, -118, -80, 14, -26, 67, 5, -63, 109, 37, 26, -21, -46, 43, - 25, -62, -121, -2, -53, 102, 24, -119, 17, -15, -33, -118, -117, -83, -85, -32, 81, 71, -47, - 54, 53, -122, -70, 48, -87, 108, 92, 96, 95, -78, -89, 28, -91, 122, -121, -57, -55, 8, - -51, -47, -15, 20, 111, -8, 72, -128, -107, -73, 53, -68, -6, 78, -80, -111, 123, 13, 3, - 81, 1, -18, 121, -10, -30, -98, -36, -44, 102, -102, 11, 21, 117, -113, -73, 0, -72, -50, - 47, -56, -104, -71, 102, -37, 29, -115, -40, 87, -3, -37, 21, 41, -109, -95, -53, 0, -6, - 4, 57, 30, -74, 28, 55, -71, 67, -77, -6, 67, 76, -32, 87, 98, 52, 98, -91, -35, - 49, -26, 21, -88, -93, -75, -41, 105, 54, 11, 92, 57, 55, -33, -128, 84, 40, 28, -12, - -23, 45, 84, -40, 56, -21, -7, 64, -36, -77, 113, -109, 104, 120, -16, -42, -48, 33, 66, - -69, 62, 2, 117, 45, 82, 41, 60, 39, -76, -63, -9, 123, -123, -16, -111, -23, 107, -45, - 43, 25, -77, -12, -28, -44, 124, 2, -79, 79, -59, 89, -28, 100, 75, -43, -12, 88, -62, - -7, -82, -90, 35, -50, -99, -29, -3, 59, -56, 72, -44, 42, -42, -17, 6, -31, 70, -92, - 85, -73, 107, -42, 10, 114, 58, 12, -61, 97, -66, 26, -127, 91, -75, -10, -76, -58, 43, - -16, 38, 50, -76, -71, 103, 1, -56, 113, -101, -98, -4, 110, 106, 64, 93, 110, -26, 80, - -72, 45, 92, -116, 56, 47, -62, 3, 81, -17, -53, -57, 113, 106, 47, 122, -125, -111, -42, - 117, -98, -84, 21, 83, 63, 25, -59, 124, -40, -19, -74, 85, -120, 74, -48, -33, -61, 39, - -105, -15, -98, 15, -24, -63, 70, 67, -121, -124, -88, 70, 45, 50, 9, 73, 75, -7, 60, - -121, -116, -49, 52, 39, 20, -53, 106, -112, -33, 82, 55, -31, 103, -58, -44, -38, 48, -16, - -46, -53, 14, -61, 70, -104, 15, -98, -100, -51, 8, 98, -75, -11, -122, -51, 77, 20, -65, - 49, -59, 33, 57, 120, -88, 23, 17, -5, 83, 8, 107, -107, -88, -10, -125, -17, 17, -84, - -25, -60, -44, -78, -56, -69, 124, -102, 122, 121, 108, -13, 116, -42, 79, 86, -31, 78, 31, - 96, -57, -115, 44, 110, -69, -75, 17, 50, -36, 100, 125, 13, 84, 122, -100, 114, 29, -34, - -72, 89, 76, 54, -85, 2, 10, 56, 21, 52, -10, -20, -17, 55, 49, -86, 77, -14, 55, - -47, 74, 102, -90, -58, -12, -24, -22, -46, -59, -110, 110, 101, 75, -80, 10, -96, 2, 24, - 111, -109, -116, 14, -86, -106, -71, -77, 113, -100, -108, 40, -20, 25, 3, -36, 64, 125, -64, - -5, 99, -118, 99, -83, 106, 32, 15, 50, -119, -72, 33, -100, -105, -42, 45, -31, 110, -112, - 34, -81, -87, 42, 111, -106, 62, 9, -71, 67, -45, -82, -8, -45, 14, -31, 65, 25, 76, - -122, 46, 122, -40, -84, -101, 79, 78, 65, 64, -52, -10, -128, -21, -16, 121, -38, 79, 27, - -57, 124, -104, 30, 80, 112, 104, 56, -97, -41, 113, 37, -73, 5, 1, 84, 90, -91, 96, - 1, -103, 64, 25, 67, -20, 13, 98, 40, -95, -93, 89, -56, -52, -109, -86, 94, -90, 57, - -67, -30, -118, -48, -70, 66, 109, 12, 67, 68, -34, -88, 81, -128, 123, 89, -51, 117, 47, - 87, -66, 115, 74, -102, -95, -85, -97, -16, 57, -15, -95, 40, 10, 50, 108, -1, -97, 86, - 72, 57, 94, 8, -16, -107, 4, -34, -47, 102, 17, -27, 44, -8, 7, 110, -99, -26, 61, - -3, 0, 18, -53, -2, -63, 7, -90, -3, 68, -19, -19, 101, 50, -38, -7, -113, -91, -117, - -116, -95, -107, -128, -13, 70, -120, -72, -110, 16, 17, -54, -126, 124, -78, -69, 21, -119, 86, - 75, 124, 17, -74, 35, 120, -61, -116, -50, -66, -72, 11, 124, 44, -115, -125, 38, -43, -36, - -6, 2, 83, -111, -48, 82, 125, 41, 95, -15, 38, 73, -65, -37, 41, 122, 31, 10, 66, - -92, 87, 28, 86, 57, 60, 39, 28, 71, 73, -111, -60, 16, -59, 66, 59, 83, -88, 116, - 87, -126, -5, -102, 10, -126, 11, 83, -79, 89, 38, 9, -47, -18, 39, 69, -95, -85, 40, - -96, 2, -69, -88, -93, 121, 41, 79, -88, 42, -72, -33, 39, 95, -51, 66, -85, -43, -67, - 33, -122, 78, -8, 59, -82, 95, -42, 26, 67, 42, 122, 2, -36, 64, 48, 66, -127, -87, - -99, -55, -82, 43, -36, -50, -1, -94, 19, 55, -52, -92, -100, 90, 114, -94, -39, -86, 111, - 71, -26, -20, -67, -56, 102, -77, -33, 78, -127, -90, 118, -96, -91, 117, 20, -4, -104, -8, - 3, 62, 43, 110, 102, -59, -34, 40, 70, 73, 51, -23, 106, -85, 93, 116, -54, 11, 101, - 58, 55, -50, -1, 85, -38, -89, 12, 107, 85, -92, 116, -23, 13, -37, 45, -51, 40, -122, - -126, -35, -112, -92, 15, -125, -8, -48, 17, 11, 111, -41, 69, -125, 32, -96, 93, 93, -24, - -11, 94, 115, 116, 28, -99, -96, 111, -80, 14, -3, 46, -43, -59, -110, 81, 102, -57, -128, - 4, -86, 122, 95, -1, 18, -29, -121, -29, -62, 18, 105, 33, -19, -69, -41, -97, 87, 80, - -85, 44, -6, 51, -102, -94, 15, -76, 65, 36, -113, -95, 84, 85, -2, -85, 26, -69, -43, - 5, 21, -80, -73, 1, -93, -24, -112, -94, -2, -94, -39, -111, 107, -35, -51, 66, -21, -105, - 13, 39, 110, 7, -112, 14, 24, -13, -107, 83, 21, 42, 20, 109, -37, -78, -70, 29, -63, - 70, -9, 12, 1, -1, 91, 30, 119, -59, -9, -125, -66, 104, 122, -46, -66, 8, 42, 48, - 21, 58, -27, 8, 53, -60, -34, -37, -106, 85, 11, 80, 112, -120, 86, 12, -1, -116, 2, - -117, -117, 95, -83, 123, -103, -98, -61, -73, -56, -69, 23, 26, 44, -9, 30, 54, -81, -39, - -126, -36, -57, -39, 55, 67, 109, 38, 32, 75, -40, 88, -112, 30, 80, 100, -46, -101, 58, - -34, -6, -111, -104, -8, -67, -107, 56, 19, -61, 70, -5, -115, -45, -34, 89, 48, 102, 104, - 89, -80, -77, 108, -99, -19, 64, 98, 119, -7, -102, 29, 117, 59, -2, -1, 119, -15, 0, - -126, 74, -82, -22, 85, 48, 107, 46, 54, 1, 22, -67, 28, -85, 50, -99, 103, 10, -98, - 120, 50, 102, 5, -100, -113, 41, -42, -67, -95, -117, 91, -24, 8, 91, -36, 86, 101, -32, - 76, -23, 111, -11, 48, 88, 30, 110, -9, -125, -111, -9, 29, 47, -121, -116, -51, -30, 114, - 22, 33, 102, -111, -45, -15, -28, 50, 11, -24, -97, -73, -1, 91, 62, -99, -95, 84, 126, - -119, -128, -78, -36, -56, -93, 83, -50, -22, -11, 52, -88, 82, -110, 37, 110, -128, 65, 7, - -75, -2, 86, 86, 32, -13, -57, 36, 120, 92, -10, 113, 114, -25, 19, -10, -31, -7, 113, - 56, -85, 0, -48, -125, 49, -88, -18, 11, 88, -30, -57, 98, -82, 25, -121, 95, -91, 75, - -55, 17, -57, 40, -124, -66, -17, 82, -76, -5, 79, 121, -119, 56, -56, 15, 64, 68, -77, - -55, -26, -109, 35, 116, 103, -6, -90, -81, -57, 45, 33, 75, -22, 38, -75, 92, -59, 25, - -56, -12, -105, -88, -72, -120, 2, 34, -7, -115, 9, -122, 26, -128, 53, 1, 100, 122, 84, - 90, 54, -70, 115, 41, 121, 115, 71, -7, 59, -76, 74, -71, 10, -48, -48, 94, -92, -30, - -87, 36, -1, 77, -58, -23, 79, 24, -16, -72, 63, 113, 60, 12, -44, -54, -85, 79, -47, - -30, 102, 105, -65, -65, -61, -103, -75, -53, 48, -114, -114, -15, -39, -114, -71, 17, 93, -21, - -20, -114, 41, 67, -112, -22, 30, 12, -66, 113, 17, -91, -11, 66, 11, -26, 72, -127, -109, - 13, 9, -41, -31, 29, 51, 23, 40, 80, -118, -121, 42, -13, 71, -43, -63, 59, 87, -74, - 82, -20, 125, 119, 14, 28, 121, 37, -12, -38, -125, -31, -12, 117, -107, 23, -33, -115, 100, - -120, -61, 22, 120, -122, 90, -65, -40, 101, 84, -74, 89, -111, -86, -54, 111, 11, 10, -90, - 86, 26, 28, -97, 17, -58, -126, 82, 112, 37, -24, 89, 98, -64, -122, -56, -32, 122, 111, - -45, -71, 37, -100, 52, -43, 83, 49, -44, 108, 17, 43, 43, -38, 115, -6, 9, 113, -30, - -4, -43, -63, 125, -65, -127, 64, -73, 90, -87, -53, 67, 17, -106, -56, -106, -56, -36, -57, - -33, -23, -11, -47, 29, -119, 25, -86, 73, 82, -69, 55, 55, 77, 42, -58, -101, 65, -79, - -67, 112, 64, -100, -6, 96, 62, 2, 34, 35, -2, 59, -64, 120, 25, 106, 1, 103, 5, - -4, 76, -19, -71, -3, -117, 15, -15, -84, 30, 25, 24, -23, 73, 114, 93, -107, -26, -109, - -126, -124, 116, -2, 58, 104, -37, 77, 105, -65, 60, 7, 93, -75, -56, 84, 83, -12, -69, - 5, 62, 103, -3, -11, -35, -105, -8, 71, 112, 31, 13, -33, -35, -93, -65, -63, 32, 84, - 6, -8, -70, -80, -115, 89, -13, -12, -62, 47, 43, -48, -107, -100, 37, 80, 103, 14, -1, - -82, -4, -33, -26, -33, 111, -71, -56, 55, -62, 4, -47, 63, 107, 46, -61, -11, -24, 2, - -121, 22, -90, -82, 23, 125, 19, -52, -16, 115, 31, -104, 98, 61, 44, -51, 67, 99, 12, - 36, -7, -112, 7, 7, 39, 106, -122, 92, -4, 99, -13, -92, -97, -101, -45, -33, -121, 10, - -3, -78, 104, 87, 115, 108, -34, 26, -41, 87, -48, -90, -106, 118, -110, 120, -24, 118, 56, - -47, 104, 5, 22, 51, 98, 1, 52, -128, -111, -91, -1, 71, 106, 17, -124, 86, 2, 24, - -97, -86, 18, 27, 93, -124, 100, -122, -12, -73, 62, -94, -22, 4, -76, -100, 38, -103, -83, - 30, 65, 94, -36, -97, -105, -29, 117, 67, -93, -110, -82, -60, 53, 11, 26, -9, 81, 24, - 79, -111, -66, 61, 117, 99, 119, 83, -55, -16, 80, 3, -40, 7, 2, -84, -54, -44, 0, - 104, 3, 120, -81, 35, 40, -48, -69, -28, 81, 18, -49, 8, -117, 88, 45, -10, 69, 29, - -38, -116, -3, 44, -119, 10, 28, -24, 8, -1, -15, -84, -81, 52, -101, -43, -57, 15, 15, - 77, -5, -99, 86, -63, -40, -112, -17, 66, -14, 121, -13, 102, -30, 114, 110, -87, 29, 111, - 47, 15, 41, 5, -103, 116, -40, 46, -5, 68, 99, -26, 57, 45, -108, 31, 114, -108, -113, - 115, 52, -108, -47, 116, 51, 3, -111, 25, 2, -100, -2, 17, 10, -87, -5, -110, 92, -112, - 81, 2, 49, -89, -53, 91, -114, -67, -87, 3, 44, 13, 50, 86, 89, 85, 32, 113, -94, - -127, 103, -47, 20, 126, -42, 53, 21, 8, 110, -81, 109, -87, 117, 33, -124, -71, 111, -81, - -110, 112, 24, -23, -55, 40, -41, -67, -20, 112, 110, -101, -40, -25, 102, -97, -22, -1, -107, - -54, -31, 76, -81, -52, 22, 3, -18, 62, -46, -123, 34, 117, 13, -115, -24, -12, -102, -67, - 58, 93, 120, 108, 5, 63, 83, 31, -11, -70, -30, 1, 90, 26, 27, -125, -125, -96, -51, - 76, 103, -95, 60, 113, -27, -100, -70, 81, 34, 60, 106, -44, -86, -49, 47, -35, 59, 89, - 35, 40, -63, -112, -60, -9, -49, -85, 78, -123, 106, -13, 110, -101, -32, -102, -49, -11, 31, - -6, -49, 100, 61, 47, 0, -90, 94, -37, 122, 3, 80, 13, 121, -112, -62, -57, 107, 44, - -23, 52, -18, -54, 27, -110, -77, -102, 51, -96, 26, -30, -74, 93, -45, -15, -29, -104, -2, - -126, 69, -11, -40, 89, -74, 101, 91, 15, 5, 25, 27, -10, -51, -126, 98, 88, -43, 33, - -74, -16, 29, 32, 82, 27, 34, -118, 34, -96, -7, -86, -113, -40, -100, -121, -80, -125, -110, - 117, -16, 32, 78, 80, -89, 61, 44, -116, -75, 30, -87, 22, 122, 113, 71, 53, -51, -92, - 100, 77, -24, 122, 22, 36, -5, 56, 91, -46, 41, 45, 106, 98, -4, 71, -82, -73, 57, - -5, 0, -15, 36, 71, 66, 6, 26, 103, -67, -113, 74, -105, 9, -24, -26, 84, 14, -34, - -109, 41, -25, 26, 2, -19, -53, 82, 83, -61, 105, -113, 27, 114, -89, 111, -32, 103, -85, - -118, 51, -28, 11, 1, 94, 34, -120, 31, 80, 2, -10, 86, -32, 71, -42, -122, -2, 80, - -82, -86, -54, -117, 111, -115, 116, -32, -31, -17, 17, -29, -99, -71, 37, 62, -42, 40, -112, - 96, -70, -112, -119, 97, 119, -28, 105, 117, 8, -98, 62, -114, -10, -4, 115, -39, 72, 64, - 17, 28, 109, -49, -48, -97, -119, -84, -95, -19, 22, 7, -76, 41, 16, 122, -58, -2, -116, - -85, 93, -28, 84, -10, -118, 22, -27, 96, -56, 71, -109, -15, -77, 46, -20, -94, 6, -113, - -107, -3, 35, 24, 21, -15, 52, -83, -2, 88, 74, -86, 126, 70, 39, -89, 80, 56, 26, - -69, 11, -71, 56, -101, 93, -97, 13, -93, 67, 24, -18, -9, 10, 104, -35, 25, -76, 98, - -68, 46, 12, 120, 1, -124, -49, 34, -42, 107, 36, 100, -29, 77, 1, 48, 48, -112, -96, - 20, 12, -77, 81, 38, -83, 67, -61, 40, 90, 96, 83, -78, -33, -53, -29, 41, -36, -71, - -69, 24, -9, -78, 76, -45, 48, 17, -24, -28, -20, -25, 75, -50, 4, 110, -57, 57, -29, - -128, 126, -63, -25, -73, -108, -86, 18, 78, 66, -47, 122, -28, -113, 77, 62, 82, -75, 29, - 50, -111, 55, -88, -19, 100, 42, 52, 2, 38, -109, 89, 110, 17, -8, 10, 10, 54, -8, - -73, -100, 99, -95, -73, -42, -65, 122, 122, 53, 122, -124, 36, -78, 81, 95, 45, -89, 2, - -113, 98, 19, 67, 35, -40, -82, -101, -111, 72, -26, -125, 31, 80, 124, -89, -35, -109, -120, - -126, 34, 59, -39, -102, 40, -59, -101, -43, 13, 5, -31, 23, 9, -92, -64, -97, -9, 110, - 14, -93, -77, 15, -80, 114, -8, 53, 85, 123, 51, 108, -90, 81, -17, -13, 63, 78, -26, - -4, 7, -110, 14, -111, -95, -95, 7, 99, -89, -59, -48, 43, 27, 110, 97, -113, 12, 62, - -48, -16, -53, 44, 24, -34, 55, 81, 23, 42, -113, -98, -51, -27, 33, 112, 55, -22, -122, - -65, 45, -68, -64, 101, -120, 37, -114, 115, -88, 77, 28, -113, -75, 2, -100, -57, 126, -45, - -96, -79, -34, -28, -68, -105, -34, -103, 24, 124, -37, -2, 61, -9, -9, 24, -9, 89, -102, - -15, -19, -34, 31, -78, -87, 10, -80, 20, -24, -88, 73, 20, -73, -98, -38, -34, 117, 48, - -66, -91, 66, 56, 40, -27, 11, -113, -102, -64, 114, 13, 17, -57, -87, 35, 38, -41, -38, - 101, -20, 62, 121, 13, -21, 26, -65, -49, 88, -124, 29, 63, -100, 13, -102, 81, -32, -100, - -79, -46, 72, -47, -123, 56, 5, -14, 126, 80, -61, -105, -21, 51, 43, 28, -22, -128, -73, - 32, -50, 12, -127, -80, 41, 0, -41, -85, -28, 25, 103, 62, -68, -70, 62, 10, 3, 26, - 39, -92, -63, 3, 53, -49, 80, -97, 9, 71, -24, -25, -44, -29, -74, -51, -6, 4, 87, - 101, 83, 43, 80, -37, -9, -65, -108, -54, -17, 117, -109, -30, -50, 75, -24, -50, -19, -5, - -106, -116, -19, 117, 65, -46, -31, 88, -65, -91, 27, 18, -24, 110, -50, 90, -56, 125, 102, - 113, 77, 54, -35, -9, -32, 121, 8, 87, 31, 7, -117, -18, 107, 109, -127, -124, -34, 13, - -35, 49, 97, 118, 67, -64, -96, -93, -95, -38, 112, 13, -103, -68, 39, -35, 25, -24, 116, - 14, 116, -88, -109, 26, -10, 82, 42, 32, 82, 110, 19, -92, -77, 31, 86, -23, -64, 2, - -50, 63, 28, 2, 88, -101, 88, -114, -58, 92, 79, -63, 17, -60, 101, 34, -95, -35, -59, - 33, 63, -60, 81, 41, -61, -83, -25, -13, -27, -59, -127, -33, 62, -68, 117, -106, 98, -7, - -85, 90, -52, 89, -115, 6, -91, -74, 111, -44, 23, -29, 114, -81, -77, -117, -97, 88, -3, - 1, 51, 71, 63, 108, -57, -122, 102, -100, -111, 94, -45, 82, 57, -92, 16, -9, -124, 76, - -115, 51, 44, 84, -11, 40, 110, 110, -87, 49, -82, 24, -83, -100, -102, 58, -127, 19, 42, - 116, -117, -64, 47, -57, 44, 34, 40, -31, -48, -63, 102, -7, -67, 1, 59, 4, -77, 52, - -76, 45, -46, -25, 39, -19, -45, 102, -81, 36, -69, -109, 70, 123, -114, 124, -40, 34, 79, - -58, -84, -94, 45, -96, -60, -34, -28, 9, 8, 125, 89, 69, 114, 48, -72, -89, -9, -4, - -83, -99, 106, 24, -126, 104, 10, 6, -78, 11, -126, -36, -4, 49, -68, 64, 86, 94, 25, - -40, -117, -80, 3, 89, -80, -25, -27, -31, -97, 122, -97, 115, -110, 91, -42, 45, -81, 71, - -75, 50, 30, -14, -11, 80, 54, 13, -23, 110, -71, 80, -10, -15, 9, 21, 70, 50, -38, - 97, 22, 25, 81, 31, -75, 111, 40, -73, -71, -21, -24, 27, 7, 32, -55, -88, 109, -45, - -93, -95, -85, -68, 87, 125, 54, 66, -16, -91, 34, 83, 84, 31, -31, -8, 100, 54, -39, - 116, 33, -45, 120, 41, 105, 28, -11, 66, 112, -62, 121, -4, 60, -78, -40, -119, 99, 66, - -10, -55, -94, -14, -60, -127, 12, -83, 7, 88, 70, -57, -36, -64, 30, 104, 72, 98, -113, - 23, 66, -112, 38, -29, -90, -79, -30, 91, -108, -49, -112, -126, -11, 112, 113, 85, -111, -38, - -15, 77, -13, -29, 48, 103, 29, 34, -3, 32, 96, 68, 109, 106, 29, -92, -39, -78, -114, - 25, -111, -6, 74, -75, 78, 44, 65, -17, 6, -77, -22, 44, 117, -36, -29, -58, -35, 43, - 95, -39, -91, -46, 8, 3, 68, 80, 9, -108, -91, 112, 73, -44, 46, 3, 30, -59, -87, - -6, 10, -86, -87, -107, 50, -88, 95, 122, -31, -114, 99, 42, -122, 92, -125, 84, 115, -109, - 119, 70, -94, -126, -25, 64, 26, -116, -36, -15, 26, 18, -70, -91, 50, -5, 122, -110, 33, - -92, 59, -118, 95, 48, -115, -53, 107, -14, 114, 35, -29, 90, 56, -91, -120, -52, -15, -52, - -14, -85, -67, -106, -36, -82, -112, -106, 4, 33, 126, -89, 21, 56, -14, 18, 77, 61, -13, - 9, -78, -44, -77, 6, -58, 117, -75, 50, 14, -92, -23, 88, -2, 96, -9, 46, -22, -125, - 112, 67, -48, 59, -102, -19, -50, 31, 67, 125, 69, -90, -82, 40, -18, -53, -31, 40, 64, - 83, 69, 112, 43, -80, 39, -66, -53, -1, -5, 116, 21, -93, -106, -113, 108, 5, 98, 49, - 97, -28, -36, -78, 58, 21, -94, -71, -9, 66, -118, 76, -28, -24, -98, -6, 105, 25, 107, - -17, 54, -86, 111, -43, -6, 76, -32, -115, 33, -85, 0, 86, -7, 45, 13, 80, 43, 40, - -71, 53, 40, 84, 53, 47, -6, -102, -34, -9, 79, 71, 75, -85, -62, -74, -5, -87, -57, - 88, -49, -20, 25, 11, 113, -40, 99, -41, -25, 114, 97, -71, -31, 45, -44, -31, 46, -118, - -74, 78, 17, 95, 63, -26, -10, 69, -72, -29, -25, 40, 83, -64, 80, -65, 91, -104, 56, - 34, -54, -43, -52, 32, 66, -15, 15, 86, 9, -29, 41, -85, -39, -70, -66, -58, -38, 72, - -115, -97, 65, -127, -73, 105, -1, -20, -73, -20, 96, -89, 65, 3, -53, 125, -89, 30, 89, - -112, 63, -38, 35, 67, -83, -108, -106, -11, 109, -69, -79, -127, 59, -74, -103, 17, -54, -12, - -43, 1, 99, 42, -71, -105, -113, 126, -27, -39, 54, 94, -40, 125, -26, 7, 90, 46, 56, - -61, -50, -21, -108, -29, -30, -17, -101, 16, 48, -15, 46, 86, -56, -96, 28, -75, 8, 124, - 97, 48, -17, 87, 95, -97, -13, -60, 96, 36, 82, -12, 51, -18, -81, -71, -111, -28, 51, - 66, 106, -104, 114, -10, 17, -115, -40, 84, 5, 2, -52, -16, -88, 33, 64, -64, 62, 113, - -62, 27, -88, -78, 72, 52, -56, -47, -106, 71, -81, -52, -87, 18, 20, 88, 122, 38, -76, - -48, -121, 59, -92, -22, -87, -120, 110, -89, -26, -64, -125, 116, -30, -123, -20, 93, 105, 67, - 89, 34, -90, -56, -127, -108, 72, 108, -41, -127, -64, 45, -107, 18, 12, -55, 109, 126, -88, - -62, -17, 64, 66, 102, -63, 84, 16, 73, -98, 34, -49, -62, 113, -105, -43, 1, -3, -76, - -106, 4, 53, -62, -13, 50, -102, -81, 124, -32, 66, 67, -33, -77, 17, -9, -50, -32, 113, - 121, -113, -43, -66, -37, -76, 11, -36, 16, 80, -31, -4, -36, 30, 23, 105, -18, 62, -32, - 110, 64, 4, 119, -98, -21, 78, -114, 102, -108, 112, -83, 10, 81, 84, -47, 65, 107, 5, - 110, 18, 36, -62, -101, 97, -87, -86, 2, 36, 61, -125, -60, 94, 114, -93, 67, 65, 100, - -62, 32, -118, 29, 68, 37, 107, 64, 9, 12, -87, -6, -47, -71, 70, -104, -18, 16, -15, - -8, 6, 76, 44, -104, -108, -116, -36, 47, -89, -42, -107, -47, -35, -29, -70, -120, -115, -83, - 23, 108, -69, 30, -48, -71, 90, 89, -114, -65, -37, 46, 71, -58, 85, 111, -101, -102, 115, - 83, -95, -14, -36, 67, -30, 52, 111, 36, 61, -55, 21, -13, -64, -28, -103, 89, -83, 109, - 50, -16, -90, -55, -122, 53, 92, -11, 38, -21, -118, 14, -110, 54, -80, -127, 83, -7, 124, - 28, -84, 52, -11, 79, -15, -19, 0, 5, -22, 82, -40, -108, -17, -120, 120, -81, -42, 41, - -119, 20, 11, 42, 111, -120, 23, 120, -116, -18, 69, -59, 75, -57, 101, -38, 107, 41, -124, - -110, -16, 77, -127, -91, 117, 63, 97, -101, -104, 43, 8, -55, 80, -14, -123, -105, 95, -56, - 55, 86, -128, 106, 102, 12, -16, 114, -99, -13, -85, 30, -79, 8, -82, 2, -80, -25, 28, - -96, -39, 124, -66, -75, -125, -94, -74, 15, 5, 41, -103, -33, 9, -22, -91, -2, 40, 8, - 117, -38, 126, 87, -15, 33, 35, 99, 100, 124, -101, -125, -76, 60, -41, 78, 10, -124, 110, - 109, 11, 34, -73, 107, -35, -85, 89, 35, -103, -113, 93, -43, -117, -64, -97, -68, -76, 19, - 101, -7, -46, -4, -89, -34, -96, 126, -2, -6, 28, -65, 32, 99, 121, 36, -12, 43, 84, - 11, 14, 41, -80, -78, -18, 119, 123, 106, 71, 55, 22, 69, 23, 16, 11, -105, -93, 17, - 15, 17, -50, 77, 70, 24, -109, 19, -110, -105, 22, 94, 8, 100, -73, -22, 54, -4, 8, - 9, -54, 60, 102, 18, 8, -16, -56, -61, 107, 19, 64, -79, 31, 61, 125, 36, 94, 97, - -113, 84, -110, -51, 76, -6, 0, -90, 16, -51, 90, -8, -25, -116, -109, -14, -28, 16, -28, - -8, 32, 75, 77, -11, 122, 17, -25, -76, -66, -122, -87, 27, -53, 111, 44, -95, -124, -84, - -109, -30, 45, -34, 50, 68, -122, 1, 52, 3, -82, -8, -86, -19, 20, -83, 6, 81, -36, - -128, -7, -31, -22, -49, 39, 37, -127, -87, -44, 110, 8, -39, 32, 20, -109, 95, 49, -17, - -41, -31, 62, 115, -45, 114, 66, -114, -109, -1, -47, -119, 4, -41, 123, 51, -97, -98, -127, - -18, 47, -93, -2, 10, -14, -16, -37, -24, 70, 66, -17, -116, -125, -114, 11, 112, -128, -75, - -69, 119, 109, -10, -41, 102, -61, 32, -118, -42, 50, -68, 103, 78, 15, 112, -112, -73, -127, - 122, 100, -75, -25, 48, -78, 54, 53, -8, -64, -65, 40, 110, -31, 101, 0, 1, -127, 54, - 80, 57, -115, 91, -46, -38, -123, -7, 102, 28, -48, 63, -35, -128, 57, -124, 119, 30, -77, - -117, -1, -46, -17, -37, -101, 10, -102, -120, -23, 98, -34, -22, 50, 29, 2, 73, 46, -32, - 118, 38, 17, -124, 34, 104, -60, 126, 97, -122, 73, 70, 55, -49, -54, 20, -45, 102, 41, - 93, 116, -89, -59, 108, 60, 89, -110, -5, -110, -30, 57, 25, 61, -70, 109, 79, 53, 37, - 37, 56, -120, -22, -111, -22, -120, 102, 71, 56, 89, -51, 21, 56, 9, 25, 102, -47, -13, - 114, -64, 79, 88, -112, 115, 123, 99, -18, 60, 116, 117, -11, -83, 100, -28, -75, 10, -105, - -30, 49, 9, 18, 93, -33, -117, -89, 30, -72, -10, -21, -37, -88, -121, -84, -12, -123, 27, - 50, -89, 62, 58, 107, 63, -5, 22, -95, 42, -66, -118, -97, -57, 85, -18, 43, 110, 84, - 17, 103, -30, -2, 2, -2, -59, 42, -59, 75, -81, 25, 75, 73, -109, 125, 70, -26, -68, - 72, -58, 119, 19, -39, -58, 55, -90, 123, -19, -106, -15, -18, 20, 17, 39, 80, -120, 27, - -122, 18, -39, 80, -32, 61, -125, 110, -83, -127, -31, -57, 69, -83, -99, 84, 121, -54, 31, - 39, -51, 20, -25, -82, -1, 99, 29, 94, 54, 63, 25, 112, -7, 106, 0, -114, -13, 15, - -128, 64, 11, 59, 33, 88, 9, -108, 126, -55, -45, -56, -24, 47, -100, -51, -121, -31, 54, - 6, -98, 44, 79, -114, 79, 80, 42, 114, -52, 125, 51, 58, -34, 57, 58, -126, -19, 79, - -109, -80, -80, 118, 81, -122, 86, -90, 32, -11, -128, 39, 84, -69, -117, 9, -45, -6, 107, - -41, 31, -8, -114, -86, 90, 57, 61, 109, 122, -50, 122, -14, 5, 74, 42, 5, 93, 90, - -48, -64, -105, 8, -98, 31, 31, 48, 0, -123, -110, 118, -121, -69, 15, 66, 58, 108, -66, - 114, -75, -38, -104, 125, -78, 91, 14, -39, -15, -119, -70, -29, 25, -103, -45, -59, -68, -31, - 114, -38, -65, -84, 95, 61, 77, 93, -49, 106, 21, 65, -123, 91, 73, -1, 95, -20, -122, - 52, 116, 42, 24, 119, -62, -111, -12, -51, 12, 4, 19, -61, -39, -58, 69, 115, 17, 27, - 41, 54, 66, -87, 83, 31, -114, 91, 29, 88, -121, 126, 108, 107, 118, 55, -35, -124, -46, - -64, 44, -100, 68, -128, -59, -21, -110, -72, 54, 95, -126, -58, 5, -57, 30, 111, -29, 9, - 100, -97, 80, -86, 61, 63, -7, -56, 22, -49, 92, 84, -117, 37, 112, 35, -19, 35, 82, - -71, 26, -101, 45, 24, -92, -4, 120, 74, -2, 117, -124, 86, 89, 57, 47, 87, -95, 79, - -118, -59, 35, -105, 44, 6, 93, -35, -121, 46, -74, -128, 91, -89, 14, -92, 48, -97, 18, - -26, 69, 82, 73, 61, 89, 90, -75, -124, 56, -78, -2, 107, -96, -33, 110, 31, -58, -8, - 107, -128, 55, -55, -91, 77, 111, -118, -68, 117, -118, 74, -128, 45, -106, -107, 17, -58, -88, - -63, 25, 101, -93, 23, -100, -106, -20, -4, -82, -76, 63, -100, 107, 2, -26, -33, 7, -30, - -95, 118, 67, -38, 17, -98, 25, -11, 47, 92, 106, 62, 110, 73, -124, -70, -18, -75, -97, - 66, 79, -99, 77, 123, -34, 73, -94, -98, -127, -66, -80, -19, -62, 74, 31, 25, 52, -107, - 103, 42, -108, -113, -1, 61, 54, 117, -101, 95, -21, 17, -126, 61, 113, -112, 95, 97, 75, - -65, 61, -110, -57, -16, -104, -110, 9, -123, -39, 93, 36, -125, -114, 92, 94, -112, -98, 97, - 126, 16, 59, 123, 49, -36, 9, -49, 90, -118, 94, -42, -74, -8, -110, 11, 14, -14, 0, - 6, -77, 66, -64, -52, 92, -29, 12, -97, -122, 52, -103, 40, -18, 45, -86, -23, -103, 120, - -33, -113, 14, -78, 9, -66, 104, 69, 4, 22, -94, -51, -11, -110, 45, 1, 117, 26, -18, - 82, -83, -83, -121, -123, 126, 106, 100, 122, 17, 114, -53, 31, -44, -97, -41, -9, -84, 123, - -88, -124, -82, -2, -46, -20, -1, 29, 91, -70, 115, 98, 67, 53, -77, -85, -40, 70, -43, - -104, 122, -68, -110, -36, 65, 82, -109, -112, 10, -11, -23, 91, -17, -66, -62, 124, -7, 5, - -89, -118, -85, 56, -117, -93, -127, -42, 94, -31, 57, -67, -56, -45, 74, -65, 32, -96, -55, - 114, 36, 82, -41, -97, 42, -21, -52, -42, -94, -82, -104, 20, 13, -79, 4, -33, -68, 114, - -72, -12, 112, 27, -103, -5, -79, 111, -35, -17, 36, -34, 81, 118, -49, 100, 30, -123, 45, - 48, 7, -3, -114, 39, 78, 2, -84, -71, 41, 49, 85, 88, -90, -12, -36, 87, -14, -101, - 105, -24, -41, 44, -70, 81, 88, 85, -122, -55, -128, -37, 75, -36, 28, 45, -18, 115, -109, - 123, -82, -103, 4, 69, 45, 38, -78, -67, -76, 42, 21, 91, -124, 3, -101, 102, 102, 96, - -34, 85, 93, -21, 116, 124, -55, 111, 99, 94, -11, -4, 77, 57, -58, 40, -90, 23, -25, - -78, -128, -88, 74, -64, -108, -112, -116, 95, 113, 111, -75, -48, -52, -112, -126, -24, -6, -124, - 47, -65, -58, -107, -32, -73, -52, 114, -53, -92, 90, 21, -42, -15, -76, 87, -93, -49, -38, - 83, -11, -7, 12, 27, -14, 85, 32, 24, -91, -22, 42, -95, 119, 115, -103, -91, 18, -59, - -34, 1, 12, -33, 17, 100, 57, -64, -61, -119, 98, -116, -124, -106, -71, 67, -85, -109, -119, - 51, 122, 95, -55, -100, -74, 86, 50, -107, -111, 65, -3, -36, -101, 116, -79, 39, 88, -127, - -109, -49, 78, -52, 89, -16, 92, -93, 6, 7, 37, 86, -69, 52, -120, 34, 27, 72, -28, - -95, -80, 116, 11, -81, 119, -17, 38, -115, 99, -80, -21, -53, -79, -82, -98, 47, -90, -3, - 126, -91, -104, -57, -6, 89, -38, -100, -94, -112, -19, 9, -17, -77, -128, 49, -50, -56, 26, - 24, -40, -41, -47, 60, 75, 92, 19, -110, -37, 9, 4, 116, -123, 32, 78, 3, 125, -108, - 119, 52, -98, -85, 14, -9, 109, -21, 56, -89, -32, -51, 69, 125, -2, 18, -45, 122, 70, - 47, 74, -65, -61, -100, -2, -72, 87, -37, 122, -49, 62, -73, 71, 69, -116, 63, 68, 16, - -39, 112, -64, -55, 28, -50, -75, 105, -35, -126, -30, -100, -40, -33, 106, -26, -55, 60, 88, - 24, 105, 83, -109, -74, 34, -112, -87, 23, -107, 46, -78, 44, -29, -5, 44, -128, -66, -30, - -81, -57, 102, -111, 126, 114, -98, 29, 72, -27, 55, 61, -15, -114, -103, 77, -66, 98, -24, - -62, 105, -9, -104, 56, 36, 105, 49, -68, 123, -53, 79, -96, 107, 70, -58, -67, -117, -41, - 88, -58, 29, -17, 91, -58, -1, -29, 98, 2, 70, -97, -43, 38, 26, 89, -38, 78, 38, - 69, -18, 20, 17, -43, -42, -35, 96, 2, 57, -19, -52, 121, 23, 12, -113, 94, 23, -113, - -116, -34, 17, -124, -44, 50, -51, -114, -6, 87, 30, -11, 86, -122, 86, -70, 10, -118, -111, - 67, -124, 71, 21, 88, -35, 109, 91, -38, -112, 33, 59, 27, -91, 7, -112, 15, 16, 110, - 22, -36, -51, 34, -92, 38, -24, 125, 86, -70, 74, -94, 101, -109, 47, -105, -46, 65, 99, - 22, -111, -17, 6, 9, -77, -70, 88, -106, -102, -82, 58, -56, 57, -31, 7, 121, -54, 66, - -106, 61, -119, 75, -84, 69, -4, -1, 4, 85, 109, -33, -48, -75, 81, -46, -8, 10, 52, - -45, 120, 120, -124, 35, -25, -106, -79, -97, -58, -52, 102, -19, 17, -30, -57, 108, -80, 111, - 113, 35, -12, -41, 51, 95, -107, -86, -92, -109, 62, -107, 10, -41, 11, -21, 50, 50, -89, - 31, 105, -49, 65, 57, 83, 98, -22, 31, 104, 94, -63, 120, -95, -88, 57, -31, -58, -56, - 21, 21, -38, -12, -3, 62, 64, -32, 51, -96, 112, -109, -118, -57, -106, 117, -82, 54, 102, - -4, 80, -40, 71, 98, -115, 111, 50, 98, 95, 59, 26, -48, -75, -76, 8, 6, 109, 86, - 53, 12, 126, 109, -101, 63, 71, 125, -90, -105, 88, 75, -21, -48, -93, -121, 82, -69, 35, - 79, -16, 0, 9, -14, 34, -15, -60, 84, 113, 77, 30, -120, 38, -67, 31, 85, -22, -5, - -37, -65, -43, -41, -32, 7, 122, -57, -51, -34, 12, -111, 47, -57, 39, 47, 126, 73, 116, - -57, 46, 72, -61, -85, 73, -73, 92, -25, 118, -21, 33, 64, 40, 111, 41, 95, 20, -78, - -74, 92, 90, 4, -107, -75, 21, -9, -59, 90, -107, 96, 61, 57, 33, -26, 53, 96, -124, - -7, -76, 48, -85, 74, -127, -17, -7, -37, -91, -12, -55, 4, 109, 92, -61, 4, 62, -22, - -24, 60, 59, 98, -21, -66, 105, 8, 116, 93, 97, -91, -80, 6, -88, -46, -35, 69, -104, - -9, 91, -55, 114, 110, 56, 80, 10, -41, -42, -78, -15, -14, 95, 84, -71, 95, 91, 112, - 125, -115, 109, -54, 105, -42, 14, -48, 47, 116, 29, 43, -21, 27, 57, 108, -46, -2, 91, - 26, -119, 125, 99, -79, -3, 119, 10, 110, 106, 101, -40, 83, 110, 23, -77, -89, -71, 39, - 58, -54, -80, -101, 77, 63, -17, 4, -40, 64, -114, -54, -83, -37, -45, -48, -109, -25, 110, - 43, -94, 53, -119, -63, 32, -95, -27, 67, 99, 62, -103, -119, 77, 10, 12, -26, 72, -76, - 7, 45, -85, -26, 1, 126, 111, 102, -68, -85, -96, 28, -1, -5, -123, 6, 120, 126, 6, - -39, 125, -98, 116, -11, -111, -120, -106, -125, -127, 67, 57, 57, -48, -78, 98, 121, -94, -111, - 29, 25, -59, 36, -63, 125, -60, 71, -24, -66, -29, 38, 60, -114, 39, -68, -20, 87, 68, - -122, 83, 16, 31, -106, 13, -69, -61, 1, 87, 48, -56, -103, 26, 101, -53, -41, -13, -119, - -30, 102, 35, 88, 33, -27, 37, 18, 104, 31, -29, 78, -115, 15, 92, -26, 107, -69, 56, - -66, 90, 10, 102, 51, 93, -99, -49, 5, 99, -11, 89, 103, 121, 65, -92, -54, -43, 110, - -49, 23, -76, -81, 5, -120, -54, 57, 49, -119, -112, 61, 16, 115, -19, -71, -41, -71, -11, - -59, 120, -94, 31, 43, 15, -98, 11, -43, 59, -47, 92, 16, -103, 118, 102, -2, 98, -83, - 116, 6, 34, 57, -53, -75, -63, 63, -111, 56, 24, -62, -59, 100, 25, -116, 108, 125, 9, - -52, 68, 28, -84, -102, -47, 27, -75, 90, -48, -78, 68, -59, 114, -110, 70, 1, -68, -81, - -32, 102, 115, -106, -125, -50, 70, 11, 108, 2, -125, 56, -3, 74, -46, -62, -72, 84, 20, - -77, 68, -77, -122, 110, 74, -5, -117, 26, -113, 121, -35, -21, -109, -78, 28, 61, -74, -28, - -23, -15, 101, 66, -126, -27, -2, 62, 36, -94, -121, -10, -32, -84, -7, -30, -27, -37, -13, - 89, 96, 80, -32, -21, 93, 113, 100, 82, -81, 1, 94, -119, -54, 107, -128, 74, -111, -104, - 96, 102, -102, 7, 1, 10, -63, -30, -60, -32, 85, 56, -109, 18, 53, 16, 81, -80, -93, - 55, 89, 45, -6, 85, -113, 121, 72, -34, 25, -30, -47, 27, -106, -41, -111, -10, 17, 103, - -32, -8, -45, -101, 22, -53, 42, -67, -42, 112, 79, 22, 95, -30, 73, 104, 20, 24, -7, - 93, 86, 123, 48, 82, 22, 13, 118, -44, -98, -66, 14, -123, 101, -93, -104, 106, -64, 36, - -120, -119, 90, 56, 10, 61, -8, 104, -87, 13, -48, 97, 90, 25, 62, -61, 81, -105, -80, - 65, -48, -51, 47, 75, 91, 9, 73, 56, -34, 93, -31, 32, 14, -37, 78, 76, -124, -57, - 14, -115, -63, -104, 70, 32, -44, -118, 28, 85, 46, -50, 120, 27, 89, -74, 82, -50, -119, - -119, -74, -101, -99, 46, -75, -108, 27, 72, -31, 113, 4, 126, -125, -1, -27, 124, -42, -90, - -21, 82, -73, 25, 87, 11, -71, -108, -48, 126, 112, 112, 55, -56, 123, 92, 52, 4, 27, - 20, -104, -27, -12, -109, 62, -20, 14, 19, 34, -96, -5, -46, -24, 31, 112, -126, 32, 102, - 75, 10, -59, -105, 72, 69, 59, 100, 61, -115, -28, -114, 111, -28, -106, 113, 61, -63, 82, - -42, -39, -122, 104, 105, -128, 86, -104, -7, -109, -46, -86, -25, -70, 96, 122, -128, 11, 45, - 106, -101, 89, 74, 36, 79, -47, 99, 70, 26, 18, -128, -35, 34, -59, 27, 64, -94, 53, - -90, 17, -126, 26, -3, -26, 112, -128, 35, 88, -47, -7, -53, -64, 34, -93, -38, -30, 93, - -127, -112, -124, -116, 122, 17, 107, 12, 17, -1, -77, -74, -76, -42, 1, 107, -85, -107, -43, - -44, 115, -32, 58, 41, 32, -127, -122, 6, -109, 48, -39, 100, 95, 53, 56, 64, -86, 44, - -84, -116, 31, -10, -48, 112, 115, 46, -8, -93, -116, -81, -86, -56, 18, 76, -15, -13, 57, - -81, -98, 67, 28, -28, -89, -79, -79, -96, -117, 14, -65, 16, -46, 37, 126, 61, 35, 61, - 23, 10, -18, 56, -57, 77, 55, 121, 3, -27, 17, -21, -58, -70, 95, -97, 117, 67, 78, - -49, 41, -63, 57, -95, -2, 68, 125, -102, 114, -61, 1, 77, -50, -52, 67, 122, 20, -26, - -22, 9, -93, -117, -99, 57, 80, 20, -42, -73, -25, 48, -95, 116, 83, -115, 22, -42, -32, - -100, 98, 113, -107, 67, -74, 32, 69, -18, 91, 23, 91, 58, 117, 31, 113, 3, -104, 60, - 46, -11, 109, 37, -112, 8, -88, -20, 41, 84, -76, -121, 52, -62, -30, -67, 101, -91, 2, - 43, -51, 60, -45, -88, 2, 9, -102, 83, 25, 105, -72, 64, 23, -32, -29, 88, 83, 108, - -65, 47, 106, -96, -10, 110, 78, -2, -89, -76, -119, 70, -39, 80, -52, 84, -67, -113, 114, - -33, -47, 79, 79, -106, 118, 21, 83, 36, -57, 16, -9, 112, 96, 57, -99, 58, 75, -96, - -81, 44, 18, 122, 45, 64, 53, -69, -35, -128, -113, 37, 50, -4, 123, -105, -88, 70, -11, - 52, -26, 20, -56, 2, 78, 111, -116, -63, -22, 89, 114, -49, 117, 16, -61, -33, 43, -1, - 5, 58, -101, 11, -28, -119, 25, 89, 4, 39, -68, -13, 10, 84, 60, -32, -94, 125, -30, - 85, -2, 8, 119, -72, -32, -108, 51, -45, -51, -9, -107, 33, 80, 119, 105, -24, -45, 68, - 41, -21, -66, -121, 106, 68, -104, 72, 64, -35, 98, 30, -45, 125, 2, -13, -109, 51, -59, - -37, -16, 121, -109, 92, -27, 21, -109, 123, -114, -60, 0, 26, 13, -87, -39, 10, -17, -107, - -76, -117, 45, -76, 22, 5, 71, 13, 120, -84, -96, 12, -102, -67, 113, -94, 75, 23, 51, - 52, -21, 52, 20, -12, -119, -56, -38, -55, 30, 125, 104, -74, 58, 48, -99, -97, -87, -126, - 120, -86, -51, -111, -6, 44, 76, 101, 123, 103, 121, 59, -89, -31, -9, -17, -68, 64, 47, - -15, -17, -121, 12, -111, -74, 86, -11, 25, -54, -18, 108, -85, -6, -92, -116, 43, -84, -82, - 49, 95, -7, -95, -104, -33, -26, -121, -99, -1, 6, 40, -56, -5, -87, 34, 18, -13, -79, - 38, 95, 71, -12, 88, 111, -121, 119, 1, -6, -98, 13, 106, 126, -60, -42, -61, 111, 3, - -91, 107, -20, -66, -77, -124, 123, 93, 90, -19, 38, -85, 79, -52, 100, -116, -57, 100, -23, - -36, 72, 29, 92, 94, 47, 23, 30, -103, -34, 50, -116, 95, -86, -52, -1, -2, -65, -14, - -126, 84, -19, 96, -117, 49, 9, 58, 70, -53, 6, -38, 117, -38, -64, -15, -1, 2, -88, - 114, -95, 49, -75, -46, 62, 102, -91, -71, -71, -26, -46, 71, 15, -21, 102, -93, -38, -67, - -101, 63, 91, -74, 31, 34, 40, -95, -22, 125, -36, -118, 120, 108, 2, -9, 24, -108, -11, - 94, 15, 28, -63, -64, -28, -30, 72, 42, 98, -107, -125, -24, 94, -124, 95, -96, 104, 35, - -126, -8, 70, 24, -21, 84, -20, 59, 43, 70, -44, 7, 106, -47, 10, 52, -39, -94, 23, - -24, -64, 56, 0, -42, -114, -25, 59, -76, -77, -13, -18, -47, -102, 49, 71, -51, 70, -56, - -55, 54, -83, -97, -77, -38, 86, 38, 41, 85, -57, -7, -37, 37, -124, 8, 93, -1, -116, - 106, 19, 81, -119, 72, -76, -101, 82, -52, -92, -127, -84, 28, 68, -128, -30, -79, 75, 21, - -110, 59, 32, -121, 48, -59, -43, -58, -24, 48, -118, -101, 118, 15, 28, 79, 125, 45, -76, - 22, 113, -68, 100, -69, 50, -105, 30, -51, -6, -127, -82, -114, 48, 5, 105, -38, 21, -55, - 24, -14, -126, 3, 107, -107, 84, -73, -101, 114, -87, -69, -104, 105, -94, -18, 87, 119, -62, - -7, 3, 41, 16, -59, 86, 42, 114, 80, -104, -108, -33, -23, 96, -92, -122, 22, -115, -13, - 11, 71, -48, 115, -32, -83, -40, -118, 40, -81, 2, -122, 57, 78, 50, 87, -52, 120, 23, - 101, 115, 22, 51, -35, 111, 1, 8, -10, -14, 9, 114, -27, -105, -100, -31, 57, -119, -12, - 2, 12, 18, 94, -29, -51, -44, -35, -69, -34, -109, -109, -102, 83, 79, 58, 105, 66, 12, - -34, 73, 2, 71, -96, -67, 111, 53, 33, -73, -112, -66, -100, 57, 59, 30, 21, 1, -31, - -63, -44, 14, -42, 107, -85, 122, 16, 99, 62, -54, 100, 60, 15, 113, 111, -108, -17, 37, - 34, 91, -121, 21, -73, -36, -38, -45, -116, 78, -54, -50, -43, 58, 92, -115, 123, 78, -98, - -123, 48, -78, -62, -127, 15, 119, -16, 72, 1, 70, -41, -121, -52, -105, 113, 68, 43, 60, - -109, 115, -7, -95, 37, -106, -1, 125, 31, 38, -8, -2, 2, 84, -86, 35, 28, -13, -92, - 49, 108, 13, -116, 33, 10, 23, -54, -66, -15, -59, 25, -122, -89, 36, 31, 37, 74, 15, - -56, 117, -33, 96, 51, -22, 17, -46, 28, 34, -5, 16, -70, -18, 19, -70, 88, -19, -22, - 76, 60, 66, -94, 7, 17, 82, 62, -5, -11, -103, -101, -109, 107, 20, -8, -40, -90, 3, - 75, 1, 60, -85, 29, -65, -108, -89, -78, -68, 66, 6, 18, -1, -35, -99, -58, 60, -35, - 6, 8, -49, 27, 40, 102, 15, 94, 34, -88, 123, 88, -14, 61, -94, -66, 90, -44, 72, - 33, 89, 116, -38, -61, 45, 118, 64, -103, 37, 32, 13, -68, -37, 118, 103, -17, -31, -79, - 36, 104, 96, 8, 99, 57, -54, -77, -116, -93, 67, -9, -49, -59, 15, -35, -106, 49, 33, - -73, -70, -54, 45, -36, 53, -124, 122, -10, -108, 29, -65, 10, -109, 52, 69, -76, -105, 86, - -107, -114, -70, 123, 52, -103, 42, 24, -101, 63, -17, -86, -1, -55, 23, 68, -60, -108, 89, - -94, -46, -36, 99, -38, 25, 107, -56, 86, -96, -5, -36, 56, -25, -74, 8, 121, -29, -100, - 98, 26, -93, 112, 64, 54, -7, -65, 97, -13, 124, -39, -78, 61, -73, 112, -127, 118, 83, - -76, 8, 121, -61, -84, 110, 18, 34, -57, 97, -65, 27, -94, 59, -9, -98, 11, -97, 70, - 11, -75, -104, -101, 123, 58, -25, -47, 107, -44, 87, 24, 19, 76, -3, 117, -90, 84, 104, - 29, 109, -111, 98, -83, 124, 35, 97, -60, 27, -43, 90, -5, 109, -119, 67, -60, 67, 93, - 50, -3, -119, -111, -96, 11, 63, 1, -67, -61, -103, -52, 58, 79, -76, -83, 115, -87, 126, - -3, 59, -128, -2, -16, -57, -123, -22, -72, -27, -67, -15, 124, -61, -85, -81, 72, 25, -35, - 119, -122, 114, 10, 27, 103, 114, 24, -116, -99, 114, 91, 1, 107, -60, -40, -73, 79, 10, - -69, -7, 84, 30, 39, 29, 81, 103, -32, -122, 76, -112, -108, 94, 102, 94, -128, -91, 52, - -26, -63, 4, 14, 126, -79, -38, 78, -120, -127, -37, -93, 111, 39, 21, -101, 115, -16, 74, - 44, -121, -14, 100, 82, 85, -4, 76, -125, 72, 37, -48, 0, 98, 9, -57, 33, -89, 81, - -26, 37, -48, -20, 94, 38, 85, -21, 84, 82, 95, -20, -127, -81, 110, -95, 66, -98, -82, - -2, 118, -59, 108, 109, 31, -48, -73, -10, 27, -73, 107, -80, 119, -76, -20, -86, -72, 35, - -23, -69, 28, -72, -44, -80, 18, -20, 69, 87, -29, 67, -58, 98, 47, -8, 23, 60, -98, - 18, 43, 18, -25, 6, 37, -43, -42, 114, -65, -22, 6, 62, 86, 81, -72, -54, 3, -64, - -6, -54, -96, -126, 93, 36, -6, 106, -90, -59, -97, -53, -124, -90, -85, -42, -36, -65, 14, - -81, -110, -124, 119, 29, -46, -24, -76, 110, -72, -28, -59, -67, 9, 72, -78, 65, 31, 84, - 103, 68, 66, -44, 91, -31, 17, -64, -58, -62, 118, -18, -27, 1, 126, 118, 53, 86, -66, - -96, 68, 6, -56, -76, 45, 83, 101, -17, 67, -79, -105, -71, 69, -85, -19, 42, -85, 5, - 89, 9, 92, -41, 3, -12, 72, -16, 87, -112, 89, -1, 92, 63, 114, -42, -115, -101, -75, - -85, -101, -110, -94, -61, 94, 96, 114, -50, -46, -21, 22, -51, -98, 120, 91, 113, 26, -71, - -117, 53, -55, 33, 40, -109, 20, 108, -11, 43, 66, -122, 19, 116, -3, 92, 78, 65, -76, - -83, 114, 119, -89, -66, -44, 113, -47, 3, -125, -31, -81, 94, 120, 50, -3, 71, -117, 63, - 110, 2, 111, -91, -73, 107, 38, 95, 33, -5, 58, 9, -83, -28, 123, 46, 99, 28, -6, - 103, 30, 97, -24, -1, -54, -123, 52, 61, -11, 122, -115, 14, -1, 68, 125, 115, -5, 102, - -93, 13, 122, -33, -33, -105, -81, 68, 106, -120, 18, -100, -40, 86, -95, -94, -56, -69, -2, - -100, 8, -48, 49, -59, 102, -72, -72, -19, -128, -69, -5, -103, 58, -110, 36, -6, -25, -104, - -16, 0, 96, 45, 30, -79, 71, 36, 39, 81, 74, -13, -112, 25, -60, -100, -96, -76, -42, - 96, 54, 41, -88, 83, -109, 70, -17, 72, -6, 15, 11, 53, 101, 18, -40, 22, -70, -99, - -19, 124, 61, -9, -5, -121, 13, 14, -30, 87, 82, 4, 31, 59, -29, 105, -81, 72, 63, - -24, 55, 120, 126, -55, 9, 50, -13, 79, -92, 16, 39, -49, -97, -60, -42, -68, 65, 33, - -116, 11, 26, -66, 7, 57, 96, -97, 67, 120, 107, -72, 54, -54, -117, 98, -22, -118, -71, - -90, 98, 68, 3, -3, 44, -8, -17, 65, 112, -64, -79, -78, -59, -6, 23, -97, -23, -68, - 116, 57, -11, 109, -11, 123, 8, -106, -74, 76, -126, 121, -113, -120, 105, -71, -7, -27, -29, - 43, -15, 102, 68, 18, 6, -20, -104, -27, 49, -128, -68, -104, 79, -81, -60, 86, 57, 15, - 11, -86, 65, 24, 39, 60, 71, -85, 30, 22, -19, 79, 110, 84, 105, 33, 36, 120, 17, - 73, 126, 46, 126, -20, -61, -40, 117, 52, -52, 63, 121, 90, -52, -3, -124, -80, -48, 103, - -67, 50, -59, -4, -93, 30, -113, 80, 77, -118, 87, 3, -122, 114, 72, -53, -53, 72, -26, - 107, -64, -55, 29, 98, 62, 1, 32, 64, 44, 3, 2, -61, -92, -5, -4, -86, 96, -46, - 88, 1, 78, -73, 119, -99, 84, 58, -84, 46, 4, -9, -112, -10, -54, -27, 118, 95, -118, - -106, -71, 50, 102, -4, 86, 60, 92, -86, 75, -58, -118, -46, 26, -21, 55, 11, 113, 80, - -123, -91, 93, -4, -119, -81, 117, 111, 24, -69, -96, 17, 66, -43, 89, 57, 101, 14, 27, - -27, -111, -38, 88, 116, -119, -98, 113, 2, -71, -35, -98, -22, -4, 102, 21, 109, -50, 26, - -35, -59, 87, 108, -56, -62, -127, -65, 115, 58, -99, 110, 122, 52, 19, 116, 39, 122, -46, - 105, -125, -94, 70, -85, 57, -111, -75, 118, -32, -30, 18, 114, 20, -46, -120, -69, -19, 62, - 42, 89, 125, 83, -72, 2, -11, -23, -57, -15, -112, 105, 9, 17, -19, 47, -22, -19, -74, - -65, -26, 46, 40, -94, -43, 55, 61, 1, -44, -111, 28, 124, -94, -105, 21, 48, 35, 108, - -42, -95, 3, 110, 31, 120, -79, -121, -37, 97, 111, 0, -103, -84, -58, 98, 31, 38, 93, - 12, -40, -47, -90, 125, -52, 58, -24, 67, -14, -18, 67, -39, -29, 5, 54, -107, -89, 5, - -67, -63, -108, -126, 93, 101, -14, 91, 59, -53, -87, 52, -2, -99, 14, -3, -66, 28, -9, - 85, -77, 120, -102, 114, -3, 122, 37, 116, -11, 13, -23, -117, -109, 81, 11, 33, -70, -93, - -65, -51, -105, -88, -22, 41, -102, 3, 42, -109, 126, -121, -21, -49, -3, 111, -37, -74, -11, - 90, 27, 17, -82, 74, 100, -59, 9, 67, -17, 110, 114, -19, 103, 104, -32, -97, -109, 83, - -5, -119, -17, -105, 59, -106, 115, -27, -109, -50, 54, -5, -103, -92, 121, -44, 121, -102, 45, - -51, -118, -42, -128, -118, -97, 108, -9, 114, -1, 3, -113, -54, -20, -5, 50, 40, -69, -66, - -80, -42, -30, -41, 56, 107, -93, 88, 44, 10, -27, 13, 69, -113, -114, -124, -49, 76, 82, - -79, -105, 5, 78, 52, 15, -27, 107, -61, 14, -49, -113, -53, 76, -96, 69, 116, 8, 4, - -54, 56, 27, -91, -59, 90, 34, 120, 46, -115, 2, -88, -83, -91, -100, 39, 68, -12, 10, - -83, 11, 85, -17, -21, 36, 38, 21, 48, -111, 121, -5, 26, -99, -98, -82, -29, 83, -124, - 94, -40, -106, 45, -104, 66, -79, -121, 105, -64, -91, -74, 14, 64, 119, -122, 76, -124, -40, - -80, 37, 7, 100, -89, -115, 111, 37, 11, 54, -114, -106, -34, 112, -15, 26, -125, -60, -118, - 43, 113, -121, -54, 103, -113, 41, 51, 15, 73, 123, 75, -8, 83, -28, -70, -29, 26, -92, - 119, 118, -105, -102, -111, 53, -127, -83, -55, -28, 121, -116, -76, 118, 123, 64, -99, -36, -126, - -101, 65, -105, -53, -109, -89, 104, 50, -55, 6, 62, 45, -127, -102, 117, 94, -60, -14, -96, - -110, 8, -18, 80, -30, -40, -69, 10, -85, 33, -60, 26, 3, -92, -38, -28, -115, 86, 46, - -42, 61, -92, -80, -128, 75, -93, 71, -5, 125, -64, -38, 52, 103, -68, 46, -35, -122, -58, - 3, 107, -22, 41, 42, 54, 41, 4, -57, -85, 40, 77, -37, -127, 63, -52, 18, 59, 35, - 123, 64, 124, 54, 126, -119, 2, 15, 44, -117, 117, 90, 77, 88, 27, 51, 120, 5, 42, - 94, -92, -119, -71, -63, 10, -87, 107, -121, -105, 110, -128, -61, 41, 80, -118, 123, 13, 53, - -57, -31, 0, -96, 1, -37, -128, 84, -79, -123, -66, 5, -7, 15, -69, -117, 14, -91, 0, - -108, -44, -46, -48, -32, 10, 4, -27, -47, -111, -39, 22, 44, -29, 94, -42, 47, -40, 71, - -72, 58, -125, 18, -121, 34, -29, 35, -6, 23, -27, -116, 102, 110, 89, 19, 60, -53, -73, - -26, 62, 43, -71, -11, -17, 73, 49, 7, 119, 3, -91, 46, 32, -5, 72, 102, -7, 43, - 72, 120, -46, -2, -37, 67, -88, -39, 103, 47, 37, -4, -32, -67, 116, -33, 27, 39, 109, - 43, -44, 57, -27, -73, -112, -52, -16, 88, 95, -18, 94, -35, -81, 107, 14, 124, 68, 41, - -76, 70, -41, 33, 124, 83, -103, -33, -110, 4, -117, 91, -3, 2, 76, 75, 108, -9, -37, - -118, -53, 18, 115, 19, -72, -93, -65, 6, -46, 76, 87, 15, -113, -121, -67, 46, -58, -107, - -84, 73, 58, 10, -17, 13, -9, -3, 70, -77, -56, 56, -114, 13, 59, 114, -109, -100, 84, - 45, -57, 41, 115, 108, -97, -53, -27, 22, 88, -91, 78, 6, 110, 1, -37, -8, -121, -115, - 34, -53, 26, 47, 2, -120, 56, -62, -90, -127, 40, -62, 44, 36, -18, 23, 124, 117, 92, - 113, 113, 23, 118, 85, 121, 46, -99, -63, -76, 86, 10, 2, 91, 57, 54, 96, 110, -40, - -103, 15, 11, -31, -61, 66, -81, 99, 111, 27, -106, -31, -41, -88, 70, -83, 64, 92, -72, - 124, -82, -91, 71, 32, -120, -119, 52, -23, 108, -100, 49, 102, -96, -32, 25, 87, -125, -41, - -106, -23, -86, -106, -92, 123, 59, -33, 38, 84, -108, 40, -85, -104, -47, -78, -14, -69, 30, - -114, -117, -77, 37, 55, 125, 105, -49, 70, 41, -19, 106, 69, -3, -98, -45, 75, -117, 93, - -69, 64, 89, 74, 8, 55, 109, 88, -126, 73, 67, -69, -59, 119, 74, -21, -80, -62, -88, - 104, 29, 4, -4, -86, 22, 111, 34, 44, -71, 7, -42, -106, 95, 113, -56, 95, -54, -58, - 93, 79, -82, -3, -81, -100, 118, -30, 34, 65, -120, 124, 102, -45, 26, 2, 111, 88, 24, - 23, 62, 74, 6, 61, 3, -119, 38, -17, -20, -80, 121, 50, 45, -128, 90, 41, -64, 70, - 8, -77, -106, 85, 63, 22, -26, -73, -91, 89, -13, 18, -61, -67, -55, -44, -49, -121, 113, - 56, -99, -121, 88, 14, 18, -105, -96, -15, 103, -119, 61, -100, 114, -84, 2, 78, 114, -95, - -32, 14, 93, -81, 31, 74, -71, -26, -98, -68, 60, 73, -90, -12, -116, 3, 15, 86, -28, - -78, 23, 17, -77, -44, 122, 106, -48, 8, 56, 103, 103, 90, 16, 110, -10, 113, -96, 113, - 88, 2, -77, 25, 36, 102, 107, -7, 32, 123, 42, 76, 74, -21, -105, -20, 113, 101, -103, - -56, 73, 15, 17, 48, -114, 86, 104, 41, 43, 15, 13, 43, 94, -115, 115, 22, -102, -78, - 49, -102, 26, 8, 33, 49, -48, 0, 17, 119, -7, 80, 42, 114, 65, 53, -10, 94, 112, - 89, -119, 15, 111, -27, 24, -35, -113, -69, -42, -41, -18, -41, -127, -116, -120, 37, -23, -108, - -11, -119, 113, -53, -2, 37, -6, -33, -30, -128, 53, -112, 44, -62, 95, -57, 72, 40, -68, - -56, -121, -90, 35, -91, 89, 57, -28, 120, -106, 91, -82, 87, 8, 89, 87, 44, -98, -19, - 32, 25, -64, -58, -91, -104, -48, 66, 59, 27, -63, -9, 23, -63, -84, -47, 7, 100, 31, - 83, 46, 64, 39, -121, 83, -37, -76, 96, -7, -71, 62, -53, -86, 8, -66, 71, -84, -6, - 67, 89, -31, -70, 62, -105, 24, 117, -104, -40, -96, 94, -119, -4, 67, 52, 124, -1, 71, - 106, 119, -80, -21, -33, 53, -61, -120, -45, -65, 118, -10, -7, 117, -2, -84, 48, -16, -23, - -108, -64, 46, 26, 116, 85, -81, -51, 89, -123, -36, 86, -28, -41, -69, -13, -107, 68, 76, - 20, 8, -19, 94, 58, -6, -118, 107, 91, -68, 95, -103, -65, 20, 40, 27, 122, -91, -33, - -38, -90, 52, 66, -58, -43, 71, -121, 45, 105, 35, 79, -104, -71, 68, 120, -53, -76, -120, - 112, -43, 43, 25, -120, -58, 111, -122, -62, 83, -32, 83, 56, 90, -91, -127, -11, 72, -123, - -56, -13, 28, 91, -66, -9, 72, -72, -95, 106, 32, 88, 65, -127, 90, 80, -83, -16, -19, - -3, 88, 69, 113, 58, 46, 82, -84, 76, 62, -50, 32, -109, -14, -73, -125, 75, 69, 34, - -74, -87, -104, -38, 80, 35, -65, -16, -1, -124, -111, 56, -51, 111, 37, -16, 34, -115, 7, - -8, 121, -31, 68, 76, 37, -80, -53, -38, 70, -128, 38, -62, -73, 82, 115, 38, 75, -105, - -72, -5, -53, 1, -125, -16, -101, 21, 5, -44, 48, -34, 71, 1, -79, -99, 120, -66, 91, - 36, -71, 118, -98, -124, -72, -55, 90, -2, -66, -70, -89, -19, 62, 62, -109, -75, -85, 13, - -81, 12, -25, 68, -51, -125, 58, 56, 95, -36, -23, -95, 38, -120, 101, 73, -3, -18, 25, - 42, -4, -60, 4, -54, 78, -90, 84, 30, -70, -58, 97, 96, -93, -10, 40, -89, 71, -4, - -101, 12, -4, 105, 49, -99, -44, 49, 100, 99, 106, 41, 72, 66, 78, 57, 21, -57, -59, - -91, 77, -104, -124, -57, -55, -123, -107, 63, -76, -30, 73, -10, 7, -48, -25, 47, 18, -37, - -17, -4, 1, 114, 124, 111, -117, 64, 42, 17, 84, 42, 12, 125, 110, 50, 24, 41, 53, - 94, -99, -15, 72, -21, 48, 88, 7, -5, 19, -117, 94, -56, -123, -76, -13, -67, -81, 108, - -20, 90, -79, 25, 98, -1, 46, -95, -31, 32, -83, 49, 41, -82, 90, -89, 62, -115, 120, - -74, 100, -106, -127, -125, 20, 45, -29, -22, -104, -101, -101, 56, -34, -114, -66, 64, 6, -7, - 32, -93, 62, 3, 28, -79, 41, 61, -98, 63, -99, -48, 62, -57, 121, 113, -120, -8, 98, - -24, 43, -5, 68, 112, 26, -16, -118, 92, -87, -52, 33, -106, -28, 36, -116, -48, 22, 2, - -25, -12, -96, -110, -127, -44, 64, -15, 48, 4, 95, 81, -12, 117, 98, -73, -103, 117, -47, - 88, 107, -75, -19, -18, 36, -122, -14, 73, -14, -69, -106, 92, 76, 60, -8, -46, 30, -113, - -88, 23, -26, 68, -10, 101, -77, -68, -94, 0, 24, -121, -61, 112, 43, -4, 39, -25, 0, - -116, -74, -94, -79, -125, -5, - -}; - -const static __attribute__((aligned(16))) int8_t input1_element[] = { - 85, -105, -10, -88, -126, 59, 111, 7, -63, 77, -32, -117, 86, 124, -119, 87, 56, 52, 67, - -29, 56, -19, -127, 7, -61, -5, 44, 25, -101, -60, -65, -23, 125, -88, -13, 87, -32, 18, - -16, 75, -5, 79, -12, 49, -35, 91, -5, 101, 60, -87, -93, -44, 99, 98, 16, -125, -101, - 67, -22, 37, -73, -72, 81, -61, 49, 11, -101, 7, -115, -55, -59, 23, 89, -91, 103, -37, - 40, 94, 88, -84, -86, -99, -119, 40, 79, 97, -112, 104, 51, 66, -5, -98, -48, 114, 39, - -59, -66, -117, -20, -66, 16, 79, 42, 79, -31, -112, -60, 103, 109, 115, -57, 72, 75, 93, - 69, -121, -45, 46, 79, -94, 66, -62, -32, 22, 120, 100, 72, 115, -47, 110, 121, 19, -61, - -127, -121, -30, -68, 1, 14, 58, -66, -119, 13, -89, -49, 71, 47, 96, 59, 73, 33, 108, - 92, -35, -124, -115, -120, -43, 10, 20, 114, 17, 14, -61, 8, 78, -79, 97, 8, -38, -27, - -9, 6, -20, 58, -12, -79, 40, 109, 17, 111, -99, 110, 34, -119, -86, 5, 16, 84, 4, - -9, 35, -8, -65, -2, 45, 104, 63, -33, 59, -19, -89, -59, 54, 46, 68, -101, -71, 41, - 92, -70, 96, -27, 123, 37, -95, -39, 124, -11, 81, 78, 83, -14, 36, -24, 74, -126, 92, - -100, 48, -44, -7, 87, 23, 82, 104, -126, 11, 77, 83, -59, 5, -2, 6, -72, 29, 39, - 49, -15, 67, 10, 21, -48, 41, 93, 76, -9, -113, 7, -105, -32, 70, 35, 125, -93, -78, - 45, -31, 122, -100, -8, -56, 83, 51, 54, 4, -7, -102, -44, -126, 58, -42, -63, -27, -35, - 84, -67, -109, 45, -106, 28, -62, 108, 88, -120, 22, -122, -23, -125, 85, -3, 68, 52, 58, - 24, 33, -22, 104, -77, 50, 65, -68, -91, -124, -115, -8, -109, 58, -56, -75, -65, -110, -120, - -113, -127, -4, 50, -49, -46, -57, -118, 21, -96, -33, 28, -55, -66, 65, 105, 94, 62, -123, - 105, 6, -28, 123, -102, -89, 60, 119, 32, 82, -61, 68, 92, 72, 22, -25, -56, 42, 47, - 113, 24, 92, 24, -98, 8, 89, 48, -14, 23, -116, -63, -32, 43, 125, 91, 9, 26, -114, - -33, -58, -99, -15, -79, 77, 88, 117, 18, -59, 84, -14, -42, -128, -56, 17, 19, 33, -69, - 83, 92, -29, 90, 85, -101, 14, -34, -68, -12, -11, 23, -59, 54, -115, 73, 79, 19, -7, - 11, 36, -23, -112, -11, -92, 33, -4, 31, -17, 96, -125, -27, -41, 21, -94, -111, 1, 39, - -117, 80, 69, 23, -70, -95, -20, -72, 125, 82, 105, 69, 95, 61, 42, 117, -85, -32, 76, - -98, 15, 86, -28, 73, 37, -1, -89, -8, -50, -33, -17, 17, -2, 21, -81, 6, -91, 109, - -102, -45, -49, 122, -72, -8, -54, 115, -50, -7, 44, 38, -70, -119, -1, 117, 2, -19, -24, - 86, -118, 49, 94, -33, -24, 67, -91, -76, -22, 100, 33, -10, -35, -35, 77, 45, -104, -77, - 108, 30, 14, -83, 35, 18, 82, 66, -13, -68, -20, -99, 94, 53, -90, 27, 27, 22, 104, - 14, -81, -20, 50, -80, 58, 71, 7, 68, 65, -21, -81, -37, -30, -98, -13, 29, 22, 80, - -92, 33, -100, -75, -2, -122, 31, -83, 5, 68, -97, 28, 67, -90, -69, -91, -50, -117, 27, - -60, 70, 62, 26, -63, -42, -14, 51, 22, -22, -69, -93, 123, 4, 32, -46, -54, 17, -41, - 106, -118, -69, -56, 9, -112, 124, -73, 16, 50, -42, -4, -123, 125, 100, -46, 92, 12, -19, - -63, -4, -29, 12, -125, -34, -51, -120, -6, -5, -122, -20, 19, -80, 112, 70, 28, 12, 25, - 112, 21, 38, 51, -120, -1, 14, 94, 83, 14, 119, 115, 42, 6, 96, -110, 47, 73, 121, - -120, 90, 41, 73, -100, -55, 63, 95, -73, -76, -126, 87, -95, -46, -112, 101, -69, 117, -85, - -28, 16, -47, 102, 8, -116, 12, 17, -24, -6, -96, -47, 38, 82, 78, -15, 15, -120, 66, - 116, -32, -11, -101, -49, -14, 121, -43, -97, -111, -118, -17, -8, 117, 11, -84, -12, 42, 41, - -102, 60, 22, -75, 86, 6, 30, 68, 126, -13, 13, 15, -93, 48, -78, -29, 14, -20, 43, - -88, -19, -36, -124, 93, -52, 63, 77, 33, -2, -65, 10, -76, 6, 83, -62, -42, 16, 112, - 108, -31, 11, 4, -64, -67, -50, -83, 7, 90, -29, -25, -121, 53, 45, -111, 48, 71, -19, - -69, -79, -86, 8, -106, -108, 44, 119, -102, -20, 42, -87, 110, 33, 67, 17, 92, 26, -34, - -123, 99, 120, -27, 79, 30, 67, -117, -118, 117, -91, -98, -126, -24, -78, 84, 18, 76, 6, - 124, 54, -108, 16, 106, -126, -87, -72, -105, -109, 86, -26, -54, -92, -67, -71, 82, -96, 53, - 12, 55, -76, -71, 49, -22, -111, -13, -63, -50, 14, -65, 51, -99, -21, 43, 63, -120, -103, - -23, 85, 40, -106, 50, 72, 8, 21, 94, -58, 41, 31, -16, 68, -125, 118, -77, 29, 91, - -33, -10, 97, -42, -96, -121, 87, 104, 5, 63, 96, -82, 33, -11, -58, 83, 48, -49, 66, - -35, 54, 125, 5, -64, 22, -4, -94, -53, 122, -91, -73, -38, -41, 106, -79, 39, 70, 49, - 40, -39, 61, 52, 40, -45, 89, 10, 17, 55, 73, -112, -64, -30, -83, 112, -118, -35, 24, - 10, 62, -69, 85, 39, 19, -32, -35, -97, 48, -100, 91, 116, 92, 60, -103, -60, -53, 55, - -98, 81, -52, -108, -6, -5, -83, 111, 48, 124, -63, 0, 19, -40, 100, -6, -114, -24, 87, - -97, 97, -34, 99, 117, 51, 39, 117, -51, -13, 10, 1, -119, -65, 110, -56, -53, -61, -100, - -84, 123, 0, 74, -101, -23, -78, -91, -70, 20, 46, 94, 86, -81, -39, 97, 28, -64, -67, - 54, 73, -92, -75, 41, -10, 108, -91, -46, -34, -71, 118, -71, -60, -29, -48, 59, 17, -79, - 98, -9, 66, 1, -123, -120, -22, 11, 113, -108, -127, 87, 72, -52, 78, 64, 53, -82, -98, - -45, 26, -93, -109, -125, 90, -56, -12, -128, -95, -81, 62, 28, -102, -127, -12, 120, 85, -115, - -31, 13, -40, 50, 43, 95, -101, -5, -83, 68, -15, -102, -34, 123, -112, -50, 6, -71, -117, - -122, -101, 32, 3, 85, -121, -111, -53, 113, -22, 58, -42, -7, 102, 44, -39, 66, -109, -101, - 61, -12, 38, 87, 43, 33, -127, -128, 110, -93, -104, 4, -71, -108, -122, 20, 68, 19, -12, - -91, 37, -122, 39, 76, 122, 21, 37, -58, 66, 112, -49, -61, 48, 111, -83, 74, -24, 10, - 36, -15, -52, -102, -109, 68, -65, -5, 72, 54, -58, -43, 91, -67, 34, 35, -100, 28, -99, - 41, 84, -29, -16, -99, -59, 125, -3, 113, -79, 20, 94, -87, -82, -110, 77, -6, 7, -21, - -101, 52, -97, -23, -107, 28, 86, -95, -19, -8, -31, 86, -62, -2, 82, 65, 18, 12, 85, - -120, 35, -83, -14, 60, -105, 100, -101, -123, -40, -46, -128, 51, -78, 96, -20, -25, 3, 79, - 66, -88, 98, -42, 73, -85, 9, -89, 13, 6, -18, -17, 112, -110, 90, -102, 124, 74, 31, - 40, 106, -107, 8, -128, -19, 13, -45, -102, 84, -44, 80, -53, -7, -104, -85, -109, 101, -78, - 42, 44, -84, 40, 42, -108, -112, -58, -78, 72, 5, 111, 72, 30, 26, 9, -10, 106, -79, - 59, -9, -99, -32, -116, 94, -95, -106, -125, -128, -94, 19, -74, 32, 105, 111, 26, 122, 67, - 4, -83, 86, -8, 30, -114, 57, -97, 72, -103, 6, -96, -68, -100, -65, -96, 13, 83, -51, - 96, 68, -83, -77, -102, -53, 91, -15, 116, 47, -67, 23, 75, -3, 25, -50, 101, 19, 28, - 7, 45, -9, 118, -44, -21, -96, -16, 43, 119, -73, -86, 20, -19, -110, 89, -122, -116, -125, - 78, 63, -4, -74, -24, -60, -23, -124, -125, -65, -89, -105, 92, -100, -1, -14, -95, 16, 37, - 77, -2, 0, -60, -55, -3, -123, 67, 35, -101, 105, 12, -63, -128, -36, 61, 30, -88, 36, - 88, -78, -22, 65, -106, 20, 44, -28, -38, 71, -68, 123, -105, -7, -65, -99, -66, -125, -119, - -80, 38, -93, -59, -62, 28, 80, 37, -39, 126, 5, -102, 59, -84, 10, -96, -52, 106, -99, - 20, -3, -35, -68, -32, 53, -115, 34, 29, 78, 12, 40, -48, 94, -94, 42, 71, -107, -80, - -7, -112, 27, 49, 86, 73, 77, -14, -49, 107, 7, -60, -99, 99, -71, -118, 89, -39, -102, - -15, -125, -2, -48, 74, -59, -29, 10, -82, -19, -115, -106, -11, 65, 33, 70, 92, -55, 7, - -46, 16, -15, 61, -74, 70, -6, 68, 18, -32, 122, 113, -40, -119, -8, 46, -119, -17, -55, - 98, 82, -72, -2, -81, 52, -111, -2, 70, -11, 65, 120, 55, -106, 74, 123, 115, -46, 52, - -117, -66, -17, 30, -61, -92, 85, 5, -41, -80, -30, -122, -119, 21, -117, 63, 40, -128, -62, - -50, -33, -24, 6, 94, 62, -102, -111, 28, 73, -58, -122, -15, -41, 61, 35, 24, 80, 34, - -86, 28, 118, 59, 67, -19, -49, 22, 67, -26, 79, 66, -25, -31, 24, -128, 83, -89, 72, - -63, -101, -45, 64, -28, -74, -66, -66, 24, -118, -22, -42, 69, 105, 74, -24, -80, 100, -7, - -47, -20, 25, -43, -7, 122, 86, 75, -23, -96, -97, 23, -62, 59, 41, 111, -90, 39, 51, - -84, 124, 118, -22, 49, 23, -94, -77, -24, -82, -17, -60, -3, 84, 17, 77, 76, 97, 76, - 21, -27, 61, -21, 104, -35, -31, 104, 95, 70, -108, -63, 89, 125, -125, 18, 42, 63, 35, - 65, 45, 93, 17, 40, 71, -64, 99, 13, -110, 62, 38, -74, -106, 85, -49, -72, 57, -120, - 91, -94, -43, -75, -88, 48, 79, 77, -123, -62, 42, 54, -68, -109, -91, 98, 3, 46, -48, - -110, -55, -61, 110, -125, -64, -67, 37, -98, 126, -100, -70, -107, 66, 77, 24, -34, -87, 65, - -109, 65, -76, 72, -121, -113, 71, 13, -20, -84, -60, 106, -44, -46, 22, 44, 96, 59, 40, - -45, 10, 10, 66, -82, -33, -10, -128, 103, 110, -93, 103, -26, 50, -108, 8, 116, 53, -95, - -13, -74, -8, -38, -93, 69, -43, -91, -92, 78, 65, -53, -119, 40, 10, 104, -113, -67, -48, - -81, -49, 27, 101, 19, -66, 57, -21, 19, 3, 68, 76, -88, -125, 75, -123, 49, -52, -84, - 16, -88, 123, -90, 108, -55, -3, -120, 98, 34, 108, -84, -99, -109, -8, -58, -10, 94, -51, - -49, 6, 82, 104, -77, 62, -112, 22, -98, 87, 76, -104, 70, 21, -85, 39, 92, -87, -55, - 61, -17, 61, 18, 69, 14, 75, 20, -35, -82, 62, -10, -43, -91, -47, 55, -90, 99, -108, - -124, 104, 46, 30, 71, 24, -99, -128, 124, -38, 42, -77, -81, 106, 35, 25, 13, 21, 16, - 43, 35, 91, 84, 45, 27, -15, -67, 83, 15, 32, -39, 74, 25, 22, -3, 12, 38, 3, - -14, 71, -26, 9, -9, 95, -112, -45, 58, 2, 122, 54, 22, -22, -50, 110, 15, 109, 4, - -106, 73, -34, -101, -73, -101, -123, 95, 78, 113, 79, -104, 61, -116, 65, -56, 41, -86, -10, - -104, -100, 50, -60, -32, -96, 99, 57, -25, 50, 81, 22, -107, 95, 57, 50, -51, -101, 36, - 98, 92, 11, 29, -47, 61, -6, 62, -40, -52, 68, 124, 25, -17, 35, 98, 68, -86, -10, - -44, 67, -44, -29, 26, 109, -96, -60, 3, -81, 10, -90, 105, -66, -42, 77, -60, -45, 34, - 52, 84, 105, 40, 66, 47, -26, -36, 52, -13, 2, -121, -72, -79, 8, -32, -29, 104, -19, - -65, -83, 54, -18, -23, -50, -67, -26, 87, -117, -2, -59, 49, -116, -33, -39, -39, -72, -59, - 73, 17, 105, -36, -96, 104, 28, -128, -116, -124, -104, -77, -107, -44, 16, 123, -33, -115, -123, - 98, 37, -33, 118, 55, 60, 119, -122, 25, -47, 22, 114, 19, -12, 5, 17, -51, 15, -78, - -47, 114, -83, 54, 33, 53, -94, -7, 84, -107, 98, -123, -37, -35, -70, 13, -123, 116, -3, - -65, -4, 70, 84, -18, -17, 107, -111, 15, 28, -98, -55, -93, 59, 0, -60, -25, -46, 81, - -10, 104, -63, -30, 63, -125, -119, 2, -37, -22, -2, -65, -41, -123, -112, -6, -25, -42, -72, - 77, -22, 109, 99, 125, 116, 20, 56, -91, 93, 86, -101, -73, 96, 115, -8, -104, -50, 87, - -114, 9, -8, 16, -53, 71, -63, -110, -30, -108, -81, 40, -91, 79, 97, 36, -31, 62, -60, - -115, -122, 13, 60, 120, -23, -15, -39, -63, -23, 58, -126, -9, 5, -98, -31, -8, -58, 40, - 113, 45, -81, -83, -8, -83, 105, -70, -118, -105, -109, 117, -76, -96, 5, -62, -57, -41, -2, - -9, -107, 72, -49, 51, 33, 3, 23, -1, 15, 15, 15, -98, 7, 15, -4, -116, 80, -38, - 48, -42, -66, 16, -128, -73, -63, -32, -5, 97, 2, 66, -14, 16, 24, 57, -44, 65, 55, - 20, 115, -4, -102, 60, 21, 61, -15, -102, -108, 75, 99, 43, 107, 126, -4, 85, 124, 88, - 91, -125, 85, -44, -29, 39, -2, 80, -22, 66, 84, 119, 41, 14, 78, 97, 76, -62, 43, - -90, 47, 108, 122, -6, 73, 9, -64, 113, -81, 1, 113, 4, -32, 104, 111, 96, 24, 64, - -32, -81, -105, -76, 21, 70, 5, -5, -77, 19, -66, 107, -59, -17, 60, -83, -36, 90, -26, - 67, -69, -52, -14, 110, -113, 107, -49, -40, 117, 103, 114, -83, -54, 97, 78, 8, 88, -84, - -37, 28, 42, -100, 1, -7, -112, 111, 60, -120, 7, -123, -89, 102, -30, -16, 33, -123, -88, - -105, -2, 87, -21, 39, -22, -30, -105, -38, 33, -25, 9, 90, -69, -121, 102, 99, 15, -105, - 37, -62, -115, 3, -97, -12, -73, 7, -6, 67, -29, 97, -112, -116, -106, -121, 49, 18, 19, - 51, -44, 60, -88, -53, -19, -47, -86, 86, 108, -102, -22, 102, 119, 118, -25, -56, -6, -53, - -115, 123, -98, -81, 121, 18, 53, -106, 66, -117, 89, -57, -94, -90, -62, -104, -87, 114, 109, - 116, 120, 14, 4, -92, 56, -79, -21, -31, 92, 61, -54, -8, -84, -21, 46, 76, -112, -105, - 98, 17, 49, -44, -74, 76, 31, 116, 35, -22, -61, 34, 39, 92, 36, 101, -28, 115, 56, - 33, -41, 58, -117, 71, 65, -59, -24, 62, 119, 4, 34, 33, 85, -25, 7, 38, -10, 64, - 120, 110, 95, 77, -1, -115, -88, 88, 121, -106, -60, -31, 58, -45, -106, 90, 89, 73, 119, - 107, -99, 49, 57, 14, -96, -57, -43, 3, -75, -62, -46, -44, 39, -65, 78, 91, 43, -50, - 94, -96, 107, 23, -39, 77, -114, -47, -30, 33, -102, -48, -52, 4, 25, 79, 27, -36, -86, - -64, 26, 71, 98, 26, -121, -82, -97, 82, 103, -62, -125, 31, 121, -4, -10, 24, 63, 107, - 95, 13, -95, -107, 92, -96, 98, -37, 64, 90, 39, 81, 43, 67, -93, -92, -118, 61, -44, - -31, 28, 7, -48, -67, -74, 14, 64, 106, 100, -40, -92, 73, -41, -62, 23, -90, 24, 43, - 58, 58, -77, -107, -89, 67, 117, -72, 2, -91, -24, -10, -55, -97, 40, 4, -67, 61, 4, - 125, -42, 111, -109, -17, 75, -118, -16, -17, -105, 119, 65, 59, 60, 85, -109, 37, 93, -89, - -70, -110, 38, -81, -72, -24, -7, 74, -60, 76, 51, 34, -124, -54, 9, -15, -83, -10, -91, - -46, 112, 59, 59, -104, 121, 22, -46, 58, -46, 97, -31, 72, -67, 125, 63, 41, -88, 36, - -100, -57, 91, 63, 99, 42, 64, 33, -92, -101, -123, 111, -127, 114, 75, -62, -75, 90, -70, - 81, 76, 2, -128, -43, 3, -61, -62, 57, -89, 12, -64, -26, -65, -121, -123, 73, -108, 66, - -74, -114, -85, -19, 66, -56, -84, -111, 76, -29, -19, 34, -48, -32, -60, 57, -43, 54, 122, - 106, -72, 66, 13, 11, -57, 96, -4, 68, -119, 55, -82, -77, 72, -72, 66, 75, 10, -21, - -8, -38, 79, 5, 90, 35, 97, -90, 7, -83, 82, 115, 110, 44, 72, -102, -113, -69, 123, - -26, 60, -83, -84, -21, 90, -24, 64, -28, 15, -72, -106, 103, -46, -32, -109, 50, -52, -92, - 36, -93, -124, 110, 98, -109, -107, 100, -60, -9, -25, 118, 73, 24, 71, -25, 10, -11, -58, - -95, 71, -124, -66, -97, 126, -96, -17, -14, 0, 38, -106, 28, -107, -60, -39, 113, 66, -77, - 1, -72, 45, -89, -85, 70, 29, 94, 122, 60, -52, -123, 46, 24, 66, 78, 81, 61, 63, - 41, -27, -15, -128, 44, 6, -106, 30, 14, -56, 90, 15, -123, 55, -74, 95, 71, -105, 0, - -9, 95, -74, -33, 20, -14, 102, 125, -26, -87, -123, 104, 64, -10, -75, 22, -57, -105, -33, - 76, 85, -18, 33, -124, 89, 109, 9, -31, 114, -25, 19, -34, 15, 84, -9, -108, 67, 39, - -95, 75, -77, -77, 108, -124, 84, -79, 29, -23, -54, -82, -34, 80, -58, 39, -97, -56, 103, - -50, 12, 15, 25, -116, 126, -55, -55, 119, -76, -82, 27, 64, -60, -59, -123, -11, 43, 4, - -31, 52, 74, 24, -78, 54, -98, 89, 123, 83, 55, 85, 56, -75, -77, 28, -63, 102, -49, - -66, -113, 113, 19, 30, 115, -41, -66, -107, 11, 10, -86, 115, 35, -84, 123, -118, -8, 20, - -117, 96, -122, 74, -67, 8, -44, -18, 79, 117, 28, -102, -108, 112, 124, -127, -90, -106, 26, - 14, -91, 2, 8, -60, 67, -35, 76, -65, 82, -59, -42, -15, -108, 59, 48, -97, -21, -46, - 66, -49, 13, -43, 74, -41, -121, -88, 49, -94, 14, 43, 66, -115, -8, -49, 93, 0, 63, - 85, -61, -105, -107, 43, 42, -2, 110, 106, 95, -4, 104, -18, -48, -76, 18, -126, 85, -95, - 112, 106, 0, -128, 113, 31, -103, -35, 71, 76, 59, -14, -40, 15, 113, -109, 23, 16, 102, - -96, -106, -85, -63, 86, 105, 23, 108, 7, 80, 83, -85, -3, -105, 110, -99, 123, 77, 85, - -7, 82, 25, 102, -73, -116, 62, -118, -58, -34, -11, 66, 84, -84, -124, -64, 95, -38, 103, - -104, -125, -18, -103, -47, -32, 97, 6, 26, 75, -42, 111, -124, 67, -31, -13, -37, 72, -47, - 77, 0, 99, 79, -76, 17, -51, 91, 82, -83, -58, 65, 72, 82, -49, -90, 6, 54, -71, - -58, 120, -3, -33, -91, -3, 119, 7, -106, 0, -20, 46, 48, 95, 61, -93, -73, -61, -108, - 78, 83, 23, -111, -91, 17, 55, 77, 69, -121, -66, 114, -87, -19, -12, 65, 63, 17, -74, - -113, -48, 27, 119, 40, -40, 96, 93, 38, -16, 120, 63, 5, -66, 88, -36, -53, -92, 0, - 15, 11, -45, -104, -31, 106, 48, -62, -115, 92, -108, 73, -88, -31, -94, -9, -40, 34, 31, - -22, 20, 20, 69, -18, 66, 63, 101, 27, 40, -43, -23, 31, -53, -7, 52, 2, -94, -40, - 71, 61, 81, -50, -7, -124, 115, 39, 11, -90, -48, -87, 46, -8, 104, -60, -11, 96, -36, - 39, -50, 49, 29, 11, -118, 70, -46, -103, -93, -2, -45, -68, 9, -35, 60, -110, 125, -21, - 37, -105, 4, -127, 94, -108, -14, 29, -93, -63, 0, -53, -83, -108, -98, -83, -101, 55, 65, - -32, 51, -75, 126, 126, -126, 49, 10, 68, -123, -72, 63, -72, 12, 9, 57, -71, 114, 110, - 16, -31, -110, 109, 91, -52, 29, 68, -26, -30, 23, -110, 59, -50, -76, -50, -43, 89, 96, - 102, -19, 20, -118, 1, -3, 104, 95, -21, -15, 28, 2, -39, 24, -48, 30, 104, -20, 22, - 119, 118, -59, -119, -94, -65, 58, -53, 88, 104, 58, 87, 63, 95, 92, 19, -19, -22, -122, - -71, 61, 118, 93, 119, -68, -85, -77, -23, -71, 96, -115, 42, 82, -60, 77, -34, 3, 96, - -92, 119, -64, -103, 51, -94, -22, -106, 75, -41, 26, -41, -54, -102, -40, 55, -122, -105, -96, - 80, -49, -37, -25, -95, 39, 36, 54, -59, 26, 123, 19, 113, -105, -90, 17, 62, -112, -126, - 97, 44, 89, -12, -72, 96, 68, -14, 79, -124, 54, -110, -30, 108, 68, 80, -5, 50, -118, - 18, 120, -41, -24, 15, 110, -62, 84, 62, 22, -97, 61, 19, -13, 44, 89, 54, -5, -4, - 111, 2, -121, -121, -57, 119, -69, 56, 72, -68, -108, 96, 125, 28, -97, 81, 11, -59, -99, - 86, 24, -87, 69, 84, -1, 125, 83, -11, 21, -34, 55, 52, 50, -126, 87, 98, 15, -59, - 14, -47, -104, 27, 0, 33, 30, -16, 19, -25, 91, -69, 106, -107, -45, -21, 114, 92, -63, - 86, -9, -39, 43, -128, -125, -73, 92, 73, -97, 76, 113, 89, 28, 74, 86, 82, 24, 96, - 35, 25, 104, -85, 119, -5, -16, -105, -105, -11, 126, 28, -102, -96, -77, 58, -111, 55, 2, - 20, 14, -27, -111, 60, -45, 81, 52, 101, 84, 56, 28, 120, 6, 51, -87, 125, -69, -76, - 82, 31, 68, 73, 108, 111, 122, 110, -54, 75, 61, -38, -125, -10, 0, -116, -74, -71, 48, - 76, -126, 7, 79, -4, -49, -13, 100, 115, 78, -82, -22, -6, 81, -40, -75, -115, -19, -14, - -47, -25, 59, -121, 109, 1, 35, -104, 101, -17, 94, -127, 126, 95, 115, -110, 53, 82, -18, - 46, 23, -2, -25, 41, -2, -51, 116, -55, -90, 80, -107, -61, 5, 19, 126, 17, 30, -113, - -114, -13, 35, 74, 63, 1, 102, -51, -99, 67, -17, -72, 82, -11, -57, -108, 105, -96, -59, - 95, 91, 111, 78, 119, 85, 79, -98, -74, -3, -15, 36, -45, -56, 31, 30, 46, -21, 25, - 30, -26, 109, -47, -1, 1, 33, -89, -72, -46, -67, 16, 54, -118, -4, -106, 61, -122, 97, - 77, 96, 92, 6, 110, -24, 11, -103, -21, -18, -88, 42, -116, 6, -79, -104, -29, -11, -105, - 66, 70, 106, -118, 69, -51, -59, -87, -91, 25, 111, -102, 84, -84, -58, -27, -38, 21, -3, - -14, -110, -22, 22, 26, 23, 50, -103, 39, 42, 111, -35, -39, -87, -12, -73, -10, -91, 6, - -63, -10, 36, -24, -121, -67, 33, -88, -22, 122, -93, 24, -54, -89, 1, -61, 3, 66, -60, - -72, -45, -100, 6, -65, 107, 34, 105, -100, -58, -61, 62, -26, -112, 20, -25, 7, 111, -90, - -120, 95, 109, 91, -32, 59, 42, 67, 73, 45, 50, -106, 99, -35, 82, -75, 60, 36, 89, - 122, -49, 8, -44, 31, 35, 17, 21, -58, 24, 29, 49, 67, -76, -23, -62, -6, -103, 22, - 122, -19, -41, -121, -121, -6, 95, 59, 46, -72, -113, -68, 59, 39, 70, 15, 93, 99, -8, - 14, 14, 37, -7, -7, 16, 68, -86, 75, -16, 123, -12, -78, -102, -2, 35, -52, -95, -11, - -63, 66, -88, 29, -9, -6, -101, 69, 98, -113, 55, 64, -83, 125, 90, 20, -114, 17, 82, - -38, -100, 21, -3, -66, -26, -115, 17, 5, 88, 2, 85, 110, -30, -122, 69, -103, 117, 22, - 22, 70, -100, -123, 123, 78, -74, 105, 97, -30, 93, 13, -20, 66, 114, 99, -110, 121, 79, - -14, 34, 19, -102, -45, -36, -122, 3, -60, 37, 95, 107, -106, 83, 16, -73, 28, -10, -119, - 31, -110, -43, -65, 56, 48, 78, 22, 60, -121, -28, -72, 75, -78, 61, -62, -94, -19, -121, - 124, 98, -6, 121, -11, -67, -122, -6, -40, -85, -120, 68, -11, 0, -99, -119, -36, 98, 104, - 60, 110, -89, 48, 10, 114, -119, 45, -111, 32, 126, 28, 9, -1, -45, 121, 95, -66, 124, - 91, -84, -56, 16, 71, -2, 30, 78, -43, 87, 70, -86, 22, 114, -46, -61, 57, -86, 26, - -120, 98, -58, 112, 117, 83, 70, 44, -26, -103, -118, 3, -93, -20, 108, 66, -108, 55, 5, - -2, -122, -127, 64, -84, 70, -121, 115, -119, 48, -89, 57, -61, 32, -99, -102, -66, 88, -51, - -111, -115, 26, 118, 46, -35, -18, -113, 125, 11, 95, 96, -68, -59, -35, -108, 8, 75, -17, - 90, 103, -60, 32, 108, 111, 50, 121, 11, -14, -59, 16, -14, -42, 40, 24, -32, 100, -110, - -44, 81, -23, -63, 96, 1, 57, -26, 123, -8, 115, -52, -55, -82, -127, 92, -42, -48, 98, - 101, -14, -71, 42, 15, -59, 23, 97, -24, -128, 4, 125, -3, 24, -92, -127, -36, 4, -29, - -24, -49, -8, -41, 47, 65, -18, 66, -104, -17, -110, -59, 77, 80, -38, -106, 108, 30, -19, - 83, 97, -73, -33, 74, 92, -105, -119, -61, -99, 99, 37, -105, 38, -103, -92, -89, 91, 38, - -61, -54, 47, 38, -91, 50, -49, -90, -11, 45, 55, 55, -94, -38, 8, -7, -105, -105, -118, - -51, -99, 17, 39, 83, -11, 126, 69, 26, 1, -48, -33, -127, 18, 19, 97, -18, 94, -7, - -114, -13, 69, 74, -65, -9, -58, -119, 67, 3, -64, -97, 15, 97, 80, 64, 46, 122, 78, - -79, -104, -52, -94, 44, -124, 2, -4, -108, -106, -18, 67, -114, -117, -117, 38, 84, 102, -46, - 7, -65, -97, 125, -128, 92, 124, 85, -124, -71, 69, 82, -52, -95, 14, 84, -8, 83, -33, - 10, -78, -15, -39, 0, -67, 53, -58, -85, 34, -34, -50, 7, 96, -79, 57, 27, -35, 6, - -12, -75, -46, -53, 4, -20, -10, -83, 12, 41, -122, 31, 29, 18, -68, 46, -80, 43, 92, - 90, -35, -123, -128, 126, -94, 60, -109, -18, 9, -99, -116, 59, 7, 114, -122, 77, 113, 96, - 75, -22, -80, -45, 61, -23, 24, -43, -17, 42, 14, -33, -55, -73, 98, 58, -61, -80, 38, - 83, -11, 15, -59, 105, 73, -110, 29, -64, -103, -14, -118, -101, 71, 62, 119, 16, 90, 15, - 52, -16, -110, 58, 54, 55, 24, 110, -97, -61, -74, -1, -118, -35, 90, -25, 64, -74, -128, - 45, -86, 3, 54, 111, 102, -100, -52, -86, 1, -56, -76, 24, -124, 118, 45, -121, -1, -43, - -9, 39, -102, 47, -63, 82, 38, -18, -43, -6, -27, 115, -25, -90, -103, 18, -10, 15, 11, - -1, 104, -51, -58, -83, 64, 102, 18, -106, -103, -84, -106, -79, -100, 2, -77, 39, 116, -108, - 24, 42, 19, -64, -73, 9, -112, -93, -123, 90, -64, 1, -64, -2, 77, -57, -30, 87, -39, - 11, 3, -112, 119, 8, -26, 109, 61, 12, 54, -86, -55, -97, 11, 6, -89, 78, -73, 99, - 12, 59, -73, 66, -125, -96, -74, 66, -81, -46, 96, -80, -56, -22, 28, 71, -23, 28, -95, - 123, 63, 49, -66, -125, -93, -97, -20, 35, 126, 103, -79, 119, 52, -115, 112, 31, 33, 53, - -11, 23, 121, 121, 39, -119, 41, 73, 62, 107, -106, 3, -97, 42, 51, -85, -6, 59, -42, - 9, -84, 56, -15, 120, 83, -44, -82, 83, -45, -77, -111, -56, -114, 1, 116, -87, 22, 36, - -74, -1, 15, -90, 53, 59, -40, -124, -42, 125, 32, -47, 24, 111, -35, -59, 110, -113, 26, - -15, -14, -30, 17, -1, 12, 0, -57, -18, -50, -25, 0, 88, -124, -81, 96, 23, 42, 86, - -97, 114, -41, -1, 96, -41, -68, -21, -43, 107, -46, 54, -19, -5, -120, 101, 103, 32, 50, - 120, -21, 51, -116, -98, 91, -36, -56, -70, -10, 112, -29, -109, -59, -2, 37, -114, -25, 10, - -11, -16, 33, -72, 17, -115, 94, -69, -123, 125, -40, 42, 116, -30, 91, 48, 90, -100, 48, - -85, 16, -124, 4, 113, -127, -9, 62, -65, 67, -95, -99, -68, 54, 99, -46, -56, 1, -34, - 14, 75, 42, 118, 9, -29, -35, 48, 45, -63, 83, -87, 118, 94, -47, -84, -70, 33, -15, - 9, -20, -78, -36, -112, -101, 122, -6, -84, -114, 49, -17, 95, 81, 124, 55, -9, -4, -39, - 12, -83, 3, -17, 27, -60, 18, -26, -69, 21, -77, -34, 54, 2, -90, 61, 15, -87, -7, - 118, 88, 52, -128, 37, 55, -88, 23, -37, -22, 50, 116, 43, 53, -28, 89, 47, 45, 54, - -6, -37, -36, 61, -69, 76, 86, -75, -98, -8, 8, -10, 90, 77, 70, 108, 73, 7, 19, - 122, -9, -39, 76, 71, 91, -107, 123, -122, 74, 102, 79, -115, 27, -32, -114, -71, 30, -87, - 11, -45, -95, -86, 86, 32, -10, 77, 26, 59, 42, 50, 93, 42, -3, 13, 38, 75, 11, - 6, -128, 81, 28, -12, 1, -75, -96, 86, -63, -52, 26, -103, -98, -102, -10, -79, -79, -123, - -33, -91, -122, 83, -112, -41, 45, -42, 91, 67, 19, -10, 80, 98, -78, 30, 120, -127, 79, - 126, -116, 48, -85, 83, 53, -71, 10, 67, -109, -40, -4, -49, 64, 64, 17, 84, 1, 118, - 73, 95, 25, -87, -116, 60, 34, 115, -47, 92, 63, -111, -46, 79, -108, -49, -96, 121, -74, - -126, 56, 85, 121, -14, -108, 33, 40, 108, 102, 102, 48, -8, -86, -52, -126, -82, 88, 33, - 26, -123, 18, -104, -5, -103, -44, -101, -55, -84, 120, -85, 21, -27, -100, 40, 106, 78, 8, - -46, -21, -5, -68, -128, -23, -66, -47, 21, -110, -7, -118, -95, 125, -119, 9, -125, 48, -57, - 106, 12, -80, 111, 35, 126, -80, 80, -99, -76, -127, -90, 112, 9, -6, 57, -113, -111, -14, - 80, 32, 97, 99, -57, 19, -103, -19, -117, 15, 119, 117, 103, -124, -96, 90, 123, 8, 63, - 10, -55, -21, 92, -105, -81, 94, 72, -87, 91, -13, -9, -120, -52, -110, 82, 97, -35, 71, - -71, 36, 51, 44, -9, -10, 31, -39, 114, -88, 7, -31, 108, -125, 80, -116, -88, 115, -116, - 23, -55, -91, -122, 0, -105, 116, 108, 124, 83, 77, 108, -83, -5, 118, 51, 79, -58, -30, - -85, 20, 47, 92, 105, 124, 86, 24, 9, 120, 40, 91, 6, -44, -27, -51, -67, -123, -78, - 41, -79, 122, -35, -11, 65, 120, -31, 45, -29, 102, 50, -85, -67, -27, -47, -20, 96, 53, - -73, 4, -77, -86, -77, -22, -26, 88, -46, 42, -20, 96, -63, -62, -128, 16, 114, -103, -82, - 7, -87, -120, 53, -59, -7, -44, -114, 97, -102, 88, 111, -2, 25, -46, -36, -107, 7, -18, - 28, 45, -118, 25, 7, 118, -12, 86, -101, 27, 26, 71, -67, 3, -63, 103, -107, 4, -55, - 14, -25, 18, 52, 68, 13, -62, 4, -14, 22, 84, 28, 107, 101, -78, -10, 52, -34, 93, - 42, -87, -94, 79, 20, -119, 99, 98, -8, -46, 36, 54, 37, -18, 94, 118, -98, -125, -54, - 13, 70, -49, -17, -74, 14, 123, -67, -48, -120, 108, -110, 95, 25, 89, -2, 59, 36, -15, - 111, -105, 126, 84, -100, -24, -118, -44, -44, -53, -41, -56, -53, 99, -115, -9, -118, 123, -108, - 117, -118, 22, 100, 90, 35, -35, 29, 69, 41, -114, 44, -96, 116, 111, -98, -69, -30, -70, - -77, -94, 112, -8, 113, -52, -118, 19, 66, 41, 23, 26, 122, 38, -94, -103, 86, -94, 28, - 87, 8, 50, -55, 68, 48, -41, 67, -41, 95, 1, 63, 114, -5, 70, -44, -112, -89, -117, - -32, 13, 36, 76, -31, 123, -3, -110, -75, 57, 61, -107, 27, 52, -40, -29, 122, 96, 11, - -112, -113, -25, 34, -107, -64, 100, -111, 33, -71, -68, -105, -23, -26, 27, -4, -110, 90, 73, - 25, 66, 36, 24, -54, -21, -69, 98, -98, -117, 27, -56, 124, -66, 24, 65, 65, -49, -78, - -116, -55, -6, -95, -61, 13, 99, 88, -72, 120, -108, 31, 32, 47, 74, 18, 43, 12, 83, - 94, -116, 44, -64, -124, -4, 118, -22, 0, -40, 65, 61, 73, -121, -84, -85, -46, 28, -64, - 0, 3, 112, -16, -6, 27, -50, 21, -41, 33, 38, 65, 115, -27, -84, -60, -73, -118, -96, - -126, -17, 107, 32, -99, 86, -27, -45, -79, -87, 55, 13, 105, -103, 25, 97, 44, 22, -31, - 91, -91, -37, 3, -98, -25, 111, 51, 27, 41, 84, 96, -52, 36, -73, -51, 102, 55, 60, - -9, 82, 2, -63, -53, 45, 75, 34, -55, 8, -59, 45, 83, 18, 52, 23, -49, 114, 98, - 48, -31, -80, 57, -89, -58, -127, -63, 96, 42, -88, -60, 24, -90, -119, 11, 98, 44, 102, - -9, -48, -44, 83, 83, -76, -83, -55, 111, 89, -28, 1, 20, 116, 116, 38, 12, -40, -36, - -47, 66, 124, -119, -109, 113, 86, -64, -20, -78, 4, -59, -109, -32, -120, -106, -44, -40, -99, - 14, -19, 98, -16, 34, 110, 61, 24, 87, -51, -82, -35, -91, 122, 50, -78, -27, -54, 73, - -50, 54, 76, -43, 48, 118, 96, 63, 100, 43, 24, -104, -111, -106, -84, 65, -72, 41, 8, - 14, -33, -128, 65, -12, -27, 39, -66, -23, 64, 21, -108, -4, 113, 110, 2, -90, 111, -36, - 6, 90, -101, 69, 88, -78, -51, -64, 42, -15, 102, -26, -57, 50, 80, -76, 98, -109, 68, - -21, -57, -118, -111, 3, -36, -99, -13, -4, -101, -80, -104, 118, 69, 124, -77, -103, 116, 51, - -115, -80, -85, -124, -53, 40, -77, 59, -22, -43, -12, 103, 56, -24, -2, 99, -120, -102, -70, - 69, 0, 44, -42, -58, -24, 51, -123, 2, 40, -69, -116, -75, 9, 83, -26, 8, -91, 121, - 96, 81, -95, -27, -59, 56, 113, -18, -67, 117, -28, -94, 47, -47, -50, 126, -41, 103, -36, - -47, 98, -80, 95, -83, 35, -71, 82, -20, 39, 11, 25, 107, -47, -113, -28, 75, -37, 19, - -32, 115, -101, 88, 9, 16, 123, -66, -115, -64, -78, -2, 77, -36, -13, -45, 125, -41, 64, - -40, 8, 13, 95, 113, 26, 30, -1, -46, 54, -6, 96, 43, 120, -47, 121, -127, 102, 97, - 75, -117, -71, -37, -105, -97, 124, -50, 4, 44, 26, 51, 122, -7, -33, 14, -46, -79, 22, - 117, -34, -29, -65, 52, 83, 70, -92, 21, 107, -31, 71, 47, -83, 37, 13, -87, 126, -10, - 102, 122, -40, 33, -69, 47, -111, -60, 125, -89, -97, -74, -10, 50, 90, -118, 9, -104, 102, - -115, -52, 113, -104, 81, -38, 82, 80, -10, -52, -87, -69, 41, 29, 57, -10, -46, 95, -71, - 33, -23, -47, 106, -122, -90, -126, -29, -120, -120, -101, 80, -20, 118, -74, -16, 6, -52, 72, - 125, -68, 82, 87, 57, 86, 110, 5, 4, 10, -22, 118, -114, -9, 63, 124, -35, -32, 82, - 75, 16, -114, -82, 82, 97, 41, -13, -118, 25, 118, 115, -17, 118, 10, 36, -28, 49, 105, - -85, 73, -76, 106, 31, 105, 70, -117, 0, -107, 23, 49, 10, -27, -100, -49, -5, 23, -27, - 18, 2, 48, -87, -11, 3, -69, -50, 25, -118, 38, -30, 49, -49, 33, 71, -110, -73, -33, - -112, 16, -46, 90, 106, -30, 33, -28, -68, -56, -60, 86, 6, -119, 26, 61, -96, 19, -22, - -25, -126, 33, 65, -14, 28, -111, 107, -44, 1, -106, 70, -127, -117, 28, 100, -108, -99, 124, - 83, -63, -105, -73, -72, 123, -35, -40, 61, 77, -3, 119, -63, 68, 3, 105, -105, 13, -100, - -115, 104, 88, 115, -8, 69, 43, 25, 65, 33, 26, 27, -31, -74, 59, -103, -104, -64, 23, - -90, 57, 15, -95, -84, 121, -104, 19, -120, 59, -72, -2, 40, 116, -67, 8, 65, -31, -7, - -35, -97, 39, -49, -39, 9, 42, 59, -32, 56, 58, 58, -73, 22, 80, -19, 110, 122, 71, - 54, -117, 58, -110, 72, -87, -90, -29, 101, 92, -117, -121, 22, 39, -110, -22, -80, 91, -3, - 4, -7, 12, 15, -62, -128, -25, -91, -18, -117, 39, -28, 77, 125, -15, -88, -45, -65, 53, - 10, 121, 11, 78, -22, 94, 33, -32, 29, -41, 79, -2, -12, -110, 40, -43, -3, -57, -74, - -79, -105, 95, 16, -112, 95, 125, -105, 11, -122, 61, 29, -112, -64, -65, -65, -8, -75, -95, - 82, 46, -87, 18, 96, -73, -62, 64, -105, -47, 96, 56, 106, 36, 54, -66, -68, -32, -48, - -55, -24, -88, -52, -65, -91, -62, 48, -87, -106, -108, 109, -102, -114, -53, -24, -50, 119, -34, - 99, -14, -123, 30, 111, -60, -115, -59, -74, 2, 22, 44, 14, -6, 37, -24, 79, -108, 123, - -119, 56, 46, -42, 120, 22, 69, 76, 106, -118, 106, -59, -36, -61, 81, 35, -128, 59, -57, - -32, 94, -24, -108, -64, -64, 93, -12, -105, -22, 71, -8, 70, -35, 66, -127, 112, -92, 58, - 114, 4, -114, -111, 77, -80, 90, -24, -85, 40, -96, 13, -69, 38, -89, -90, 3, -38, 82, - 119, -93, -65, 88, 37, -119, 23, -61, -127, -12, 104, 52, -85, -73, 1, 105, 88, -106, 126, - 109, 3, 109, -30, -6, -80, -90, -85, -105, -16, -110, 56, 20, -14, 31, 8, 96, -126, -121, - 61, 117, 116, 22, 23, 19, 81, -54, 32, 30, 86, -118, 1, -57, 76, -46, -3, 84, -99, - -11, 86, 108, -96, 3, -71, -25, 14, 77, 30, -26, -37, -20, 78, -28, -101, 5, -102, 70, - 94, 96, -19, -110, 22, 33, 4, -8, -115, -72, -84, -100, -116, -40, -58, 23, 56, 48, -80, - -74, -17, -124, 65, -113, 14, 3, -72, 30, -108, -94, -23, 20, -25, 102, 95, 113, -92, -76, - -127, -40, 52, -114, -128, 123, -42, -118, -74, 126, -87, 111, -125, 47, -106, 115, -79, 61, 112, - 45, -68, -59, 88, -86, 108, 53, -32, -18, -98, 97, 126, 72, -78, -101, 54, 39, 34, 0, - 76, -59, 22, 69, -8, 83, 8, -18, 120, -27, -105, 26, -40, -38, 94, 77, -108, -42, -66, - 16, -32, -53, 98, 94, 114, 118, 87, -113, -76, 45, -30, -126, 28, -33, -7, 83, 59, -7, - -16, -18, -14, 6, -39, -19, 24, -111, -105, 85, 37, 61, -41, 93, 19, 35, 38, -38, -97, - -50, 83, 101, -60, -33, -124, 3, 85, 126, 40, 48, 106, 63, 2, 95, 83, 113, -36, 53, - -92, 48, -100, -8, -73, 107, 85, 41, -111, 48, 84, 85, -48, 13, 46, -117, 37, 19, -39, - -14, -113, -103, 33, -42, 115, -107, 45, 99, 81, -108, 75, -106, 52, 103, 36, 55, 63, 77, - 123, -68, -77, 15, 16, 22, -10, -68, -33, 11, -18, 117, 105, -64, -22, 6, -108, 99, -30, - -86, 96, 22, 121, -12, 104, -60, -86, -100, 98, -74, 68, -124, -50, -57, -25, 68, 110, 53, - -113, 29, -53, -111, -88, 65, 94, 104, 69, 71, -7, 35, 18, 95, 36, -120, 77, -29, 105, - 83, -87, 66, -110, -19, -47, 123, -104, -120, -50, -74, 104, 43, -119, -114, -86, -41, 116, -64, - 23, 21, -111, -65, -116, 21, 42, -105, -73, -94, 32, 117, 10, 121, 29, -128, -117, 65, -64, - 62, 84, 13, -52, 6, 108, -60, -111, 44, -29, 38, -43, 65, -100, -31, 11, 17, 55, -10, - -61, 75, -25, 75, -99, 53, 69, -87, -9, -55, -121, -16, 121, -99, -122, -7, -87, 69, 73, - -73, -64, -73, 60, -105, -41, -105, 65, -93, 72, 101, -109, 101, 47, -54, -15, -7, -39, -118, - 59, 29, 35, -68, 85, 101, -65, 43, 91, -91, -51, 72, -30, 64, -114, 81, -63, -86, -86, - -56, -41, 94, -98, 64, 88, -33, 28, -43, 113, 125, -85, 47, 126, 68, 27, 33, -50, 32, - 63, 26, -123, -84, 123, -110, 63, 51, -63, -74, -100, 120, 108, -119, -31, 126, -119, -115, 27, - 89, 69, 7, -39, -107, -113, 84, -84, -85, 110, -24, -1, 95, -26, 94, -63, 10, -79, 123, - -2, 126, 91, -89, 36, -77, 24, 122, 57, 84, -94, -97, -1, -36, -123, 54, -40, 2, 39, - 18, -50, -23, 70, -39, 2, -34, -12, -120, 126, 123, 14, -53, 9, 31, 37, -30, 121, -12, - 38, 91, -46, -122, 0, -84, -92, -52, 126, 42, -41, -71, 53, 59, 16, 59, 20, 20, 107, - -18, 105, 91, 88, 89, 0, -30, -105, -46, 25, -42, -102, -126, 30, 69, 82, 10, 30, 67, - -24, -39, 3, 69, -82, -117, 112, -108, 119, 34, -1, -47, 100, 122, 64, 32, 35, 87, 97, - 59, -128, -89, -26, 86, -54, 70, 64, 0, -43, 119, -25, -25, -111, 63, 120, -20, -26, -78, - 61, -100, 47, -105, -20, 17, 82, -20, -47, -55, -99, 31, 86, -110, 112, -10, -92, -7, -40, - -60, -111, -25, -19, 123, -93, -50, 69, -100, 67, 26, 28, -25, 70, -55, 124, 29, -49, 109, - -110, -91, -12, -64, -13, 65, 92, 9, 122, 83, 122, 15, -91, -97, -96, -106, 60, -1, 17, - 113, 45, -43, -98, -117, 23, 99, -23, 11, -53, 46, 58, -19, -54, 41, 13, -92, -17, -110, - 78, 121, -53, 113, -53, 15, 110, -69, -28, -125, 70, 114, 26, 116, 83, 43, -58, 60, 85, - -84, 32, -118, 35, 77, 64, -93, 60, -125, -69, 44, 27, 41, -31, 109, 109, -10, -28, 50, - 122, -41, 50, 39, -107, -68, 114, -13, -110, 76, 67, 51, -117, 21, 85, 6, -18, 108, -21, - 14, 87, 73, 71, 24, 58, -22, -108, -77, 51, 84, 4, 56, -46, -57, 9, -32, -67, 119, - 51, -2, 0, 78, -87, 18, 60, -82, 52, -57, -80, -1, -109, -50, -78, -94, -24, 0, 100, - 71, 58, -51, -56, 23, 28, -119, 90, -54, 46, -40, -109, -16, -84, -46, -40, -14, -3, 111, - 43, -88, 32, 126, -12, -49, 90, 0, 34, 122, -10, -93, -16, -94, 62, 59, 73, -24, -3, - 57, 37, 113, -6, 103, 102, 66, -26, -126, 96, -122, 59, -88, -28, 69, 82, 107, -64, -50, - 102, 70, -14, -48, -55, -128, 124, 72, -24, -48, -73, -8, 56, 49, 86, -95, -35, -63, -63, - -24, 15, 79, 51, 57, 7, -98, 100, 58, -118, 107, -52, 86, 108, -77, -51, 97, 11, 70, - 69, -102, 104, -9, -68, 108, -43, 41, 29, -75, 30, 47, 39, 23, -58, 37, 29, 118, -90, - 16, -83, -40, 118, 77, 46, -2, -123, -43, -80, -109, 109, 49, -54, 103, 116, -102, 21, -71, - -15, 33, -76, 44, -47, -125, 6, -68, 86, 6, -86, 14, -101, -45, -21, -41, -51, -12, 24, - -101, 73, -79, 42, -11, -68, 55, 54, -61, -127, 74, 114, -34, 67, 79, -99, 119, -59, 22, - 110, 66, 15, -60, -58, -85, 82, -40, 50, -88, 48, 119, 44, 50, -8, 68, -109, 35, -99, - 117, 7, -110, 64, -93, 67, 6, 3, 63, 122, -120, -30, 71, -71, -71, -101, 17, -18, -18, - -80, -121, -37, -25, 119, -89, 113, -69, -108, 65, 63, -103, 8, -83, -48, -62, -118, 93, -104, - 86, -117, -86, 18, 120, -23, 68, 56, 125, 66, 62, -101, -65, 2, -8, -94, -34, 22, -127, - -78, 37, -55, 13, -81, 109, -105, 80, -7, -71, 79, -53, 29, -26, 87, 97, 75, 47, 77, - -92, 32, 46, 16, 73, -116, 84, 87, 14, 12, 95, 123, -124, 117, -86, -19, 2, 61, 87, - -117, 62, -19, 106, 0, 90, -104, 63, 13, 87, 13, 35, -50, 57, 66, -34, 92, -21, -9, - 78, 108, -113, -88, 70, 118, -128, -80, 84, -24, 65, -88, 37, -113, 81, -112, 111, 112, 115, - 106, 41, 65, 94, 32, 27, -69, -58, -86, -118, 79, 67, -23, 51, 54, -1, -57, -92, 101, - -71, -71, 102, 63, -18, 76, -25, 42, -128, -117, -67, 116, -8, 114, -126, -34, -22, 12, 102, - -80, -73, 68, -2, 46, -72, -25, 18, 50, -87, -5, 45, 116, -116, -36, -32, -67, -74, 68, - 101, -73, -107, 55, 113, -101, -30, -100, -88, 37, 72, 23, -71, -37, -101, -106, 29, -11, 77, - -44, 62, -28, 37, -120, -25, 71, -92, -68, -19, 108, 65, 15, 28, -28, -50, -29, 0, 125, - 96, -15, -3, -52, -41, 95, 46, -10, -36, 72, 87, 59, 27, -107, -86, -83, -6, 70, 28, - -33, -47, -74, -98, 27, 10, -6, -89, -19, 81, -88, 64, -35, 79, 116, -120, -123, -40, -114, - 124, 115, -90, -42, 47, 55, 62, -34, -75, 32, 107, 45, 100, -64, 65, -29, 126, 50, -50, - -88, -118, 78, 13, -70, -53, -79, 113, -33, 84, 112, -117, -15, -63, -22, -10, 19, 72, 38, - -94, -16, -106, 25, -25, 85, 27, -58, -62, 14, -25, -43, 112, 70, -126, 65, -75, -93, 58, - -126, -60, 29, 116, 90, 86, -68, -81, 20, 94, 41, 78, -18, -31, 55, 87, 121, -62, 125, - -120, 3, -35, -26, -78, 87, 83, -9, -99, -49, -87, 18, 52, -108, -120, -89, -116, -102, -43, - -29, 90, 15, 71, 36, -78, -102, -29, 50, -32, -88, 105, -45, -76, -43, -68, 17, 6, -120, - 13, -19, -30, 78, -46, -65, -66, -110, -91, -22, -12, 82, 99, -67, 9, 38, -128, -123, -31, - -27, 19, -67, -33, 27, -46, 59, 92, -77, -111, -123, -101, 1, -58, 29, -86, 32, 117, -21, - 38, -87, -8, 116, 126, -27, -75, 32, 80, -117, 61, 1, -33, -71, 20, -48, -76, 87, 70, - 111, -11, -92, 103, -20, -41, 72, 113, 8, -3, 20, -79, -34, -79, -101, 24, -23, -85, -113, - 80, 45, 81, -82, 15, -119, 45, 45, 65, -97, -95, -63, 10, -67, -53, -25, 112, -125, 76, - 85, -24, -124, 63, 83, -36, -63, 106, 73, 106, -90, -19, 93, 74, 38, 108, 50, 90, 102, - -2, -15, 20, 99, 98, 8, 3, -25, 63, 121, -32, 46, 23, 4, 62, 87, 55, -44, -74, - 73, 17, 54, 106, 48, -118, -10, -7, 116, -75, 40, -55, -92, -128, 35, -43, -38, 34, -58, - 117, -10, -126, 28, 80, 101, -63, -13, -123, -10, 11, 76, 124, -1, 117, -9, 22, 112, 80, - 94, 94, 107, 17, 53, 75, 69, -32, 19, 100, -86, -107, -96, 116, 83, -43, -47, 99, -87, - 41, 73, -89, 36, 34, -45, -91, -27, 95, 73, -86, 122, -82, 76, 90, -2, -35, 12, -17, - 99, 21, -15, -2, 50, 14, 93, -18, 104, 57, -83, -121, -92, 115, -19, 114, -125, 112, -116, - -92, -89, -111, -95, 119, -113, 33, 64, -117, 33, 44, 26, -74, -28, -19, -123, 110, 125, -26, - 5, -105, -110, -48, -30, 10, 64, -21, 93, -22, -52, 126, 39, 65, -22, 8, 101, 104, 29, - 71, 75, -33, -69, -120, -33, 11, 47, 100, -109, 109, -97, -101, 3, 81, 89, -119, 65, -64, - -75, -101, 65, 32, -89, 13, 77, -99, -56, -82, 10, 81, 2, 53, 119, -48, -82, 82, 124, - 67, -91, -98, 100, -30, -95, -6, 23, 18, 94, -78, -83, -47, 45, -22, -123, -125, 17, -69, - 6, -38, 60, 29, 73, 54, 81, 68, 28, -100, -60, 85, -121, 60, 69, -26, 102, 61, 46, - 17, 99, 82, 125, 64, 82, 78, 57, 41, -98, 44, -88, 16, 65, 86, 81, -122, 120, -90, - -105, -80, 51, -72, -29, 50, -91, -52, 21, 80, 66, -36, 0, 103, 59, 88, 17, 60, 66, - -62, -17, -12, -20, 106, -58, -16, -103, -117, -21, 3, -45, -25, 53, -120, -80, -8, 15, 70, - 106, 89, -73, -120, -102, -124, -88, -61, -115, 56, -75, 126, 47, -4, -26, -58, -89, -104, 7, - -75, 3, -1, 57, 63, -47, -73, 108, -25, -55, 34, 48, -125, -127, -121, 6, 36, -80, 48, - 42, 1, 99, -80, 72, -15, 26, 79, -88, -61, 120, -54, 93, -41, 47, -26, 111, 45, -54, - -113, -72, 110, 7, -121, -93, -108, 81, 54, 82, -63, -64, -50, -69, -73, 81, -65, 104, -96, - 2, 78, -33, 118, 89, 89, 86, 53, -55, 24, 112, -122, 100, 14, 38, -25, 92, 23, -58, - 20, -113, -121, 25, -50, 99, -23, -96, -112, 39, 77, -79, -65, -116, 100, -108, 87, 102, -109, - 83, 70, 8, -53, 109, 109, 53, 61, -24, 28, 96, -34, -103, 86, -5, 107, -67, 106, 39, - -9, 95, -101, 109, -70, 78, 73, -120, -34, 59, 70, 91, 17, -83, 78, 51, 66, -85, 80, - -106, 126, -117, -53, 25, 40, -16, 89, -58, -79, -46, 72, 94, 73, 110, 20, -19, -4, -53, - -83, -87, 26, -3, 76, -54, -69, 72, 65, -44, 26, -98, -42, 81, 92, -121, 66, -26, 6, - -84, 11, -31, -23, -103, -90, 93, 126, 71, 80, 65, -3, 57, 79, -6, -52, -2, -43, 2, - 51, 114, 61, -5, -111, 51, 30, -11, 94, 79, 12, -50, -78, 65, 84, 34, -23, -55, -75, - -47, -100, -51, -45, 119, 0, 27, -3, 14, -54, 16, -4, 1, -83, 35, 1, 40, -81, 56, - 33, -79, -30, -49, 59, -90, 120, -65, -42, -119, 84, -91, 118, -1, 24, 118, -49, 63, 78, - 125, 81, -48, -33, 73, -57, -112, 8, 52, 9, -125, -32, 24, -86, 10, 70, 122, -68, 28, - -3, -11, 57, -110, 40, 120, 101, -60, 20, 74, -47, -4, -65, 79, -25, 10, 99, 99, -50, - 103, 76, -5, 103, -21, -93, 53, 8, -67, 21, -25, -126, -125, -24, -123, -69, -65, -42, 58, - 71, -43, -79, -92, 107, 87, 58, -117, -87, -79, -9, 18, 77, 107, 104, 118, -65, -37, -37, - 109, 102, 98, -26, -94, -30, 89, 107, 20, 13, 25, -41, 41, 6, 115, 112, -101, 84, 126, - -2, 23, -113, 118, 60, 39, 110, -20, 2, -36, 94, 123, -128, -62, 76, 55, -2, -61, -89, - 50, 63, -104, -58, 77, -4, 57, -90, 1, -98, 70, 21, 9, -27, -42, -99, 120, 84, -124, - -53, 30, -19, -111, -28, -114, -54, 112, -93, -86, 25, -92, -41, 8, -66, -91, 86, 21, 96, - -99, -38, 118, -87, 21, 71, 2, 111, 106, -27, 29, 65, -113, -115, 82, 65, 79, 26, -101, - 49, -78, -110, 51, 26, -14, -80, 112, -25, 55, -94, -33, 45, 104, 116, -91, -103, 14, 116, - -106, 98, -54, -101, 84, -113, 122, 65, 23, 83, 90, 65, 76, 82, -106, -20, 73, 57, 58, - -114, -43, 69, -116, 104, 45, -91, 94, 24, 112, -31, 123, -25, 48, -104, -115, -59, -94, 37, - -98, 85, -60, 68, 46, -54, -10, -101, -47, -49, -48, -21, 113, 55, -28, -117, 99, -117, -32, - -126, -123, -108, 70, 123, 120, -10, 33, 83, -111, -54, -29, 101, 43, 16, 26, -25, -83, 104, - -105, 117, 11, 38, -101, 32, 119, 124, 108, 34, 31, -31, 71, 92, 125, -46, -55, 72, -82, - -61, 125, 42, 50, 114, -96, -3, 105, 34, -32, -126, -88, 54, -85, -5, 7, -74, 7, 88, - 70, 100, 25, -7, 75, 35, -7, -60, 70, 47, -68, -87, 51, -29, -74, 30, 16, 86, -12, - -119, -87, -51, 77, 70, 88, -68, -77, -126, -88, 89, 76, -99, -114, -3, -67, -86, -4, -57, - 6, -48, 5, -38, -90, -128, -41, 73, -123, 38, 107, -73, -64, -104, -100, -104, -22, -51, 39, - -53, -49, 78, 2, 120, -53, -11, 115, -4, 9, -111, -78, -18, -17, 83, 12, 5, -38, 115, - 27, 63, 90, 0, -17, 74, 19, -26, -10, -100, -100, 62, -8, 110, -59, 28, -90, -104, -106, - 46, 34, 24, -38, -70, -125, -35, -11, -27, -46, 44, 126, -38, -124, 115, 67, 38, -34, -12, - 0, 15, -11, 47, 118, -21, 125, -82, 23, -61, 125, -4, 81, -44, -109, 60, 54, 5, 1, - 122, -43, 66, -123, -103, -31, -24, 45, -61, -75, -115, -62, 121, 36, -66, 108, 60, 43, 30, - -55, -24, -86, 73, -60, 120, 102, 5, -47, -125, 76, 38, -71, -89, -71, -82, 21, -56, -3, - 56, 5, 53, -52, -13, -26, 14, 14, -108, -59, -17, -52, 118, 126, -103, -86, -101, 22, -51, - 89, 1, -62, -84, 57, -7, -115, -60, -87, -7, -93, 88, -2, -31, 70, 6, -100, -37, -100, - 110, -71, 8, -69, 48, 120, -37, 84, 102, 16, -18, -92, 10, -103, 10, 59, -32, 124, 8, - -120, -43, -127, 125, -96, 84, 73, -32, -48, -67, -92, 59, 125, -42, 51, 101, 8, 83, 61, - 7, 19, 20, 2, 76, 52, -60, -79, -61, 117, 41, -2, 123, 88, -32, -62, 6, 91, -42, - 68, 18, 54, 119, -105, 37, -7, 62, 73, -121, -59, -123, -59, -21, -37, -77, -72, 13, -122, - 97, 69, 93, 8, 55, 100, -3, 87, 8, 43, 102, 40, -46, 40, 12, 28, -14, -74, -113, - -97, 7, 21, 57, -30, -57, -37, 57, -97, 91, 75, 96, 54, 67, 42, 114, 78, -58, -60, - 125, -27, -102, 15, 118, 87, -122, -14, 119, 14, -70, -125, -8, 118, 21, -27, -34, 10, -63, - -3, -7, 20, -2, -2, -85, 71, -90, 115, -91, -23, -114, -39, -26, 49, 5, 75, 32, -21, - -20, 33, 123, 95, 31, 41, 74, -10, 99, -30, -19, -75, -69, -112, 64, -84, 80, 105, 77, - 80, 5, 47, 15, -44, -11, -84, 20, 81, -69, 100, -44, -64, -2, 83, -73, -15, 109, 48, - 125, -103, -90, 50, -97, -54, -94, -9, 113, -94, -16, -124, -1, -82, 30, 91, -2, -14, -108, - 1, 26, 54, -49, 26, -34, 57, 56, 111, 111, 67, 108, -39, -69, 74, 12, -12, -2, -97, - 0, -113, -58, -61, 126, 13, 83, -86, 110, 12, -11, -95, 43, 80, -106, 55, -102, -68, -103, - 76, -124, 67, 38, 94, -17, 65, -25, 46, 8, 78, 117, -38, 36, -59, 95, -29, 110, -115, - -72, 27, -125, -41, -83, -58, 66, 13, 29, 72, -89, 1, -32, -89, 122, -108, 84, 15, -65, - -109, -103, -121, 4, -14, -5, -112, 87, -43, 45, -86, -47, 15, 6, 15, -119, 81, 3, -109, - -114, -66, 28, 71, -22, -59, 12, 10, 103, -128, 51, 58, -107, 12, 41, 45, 107, -11, 70, - -24, 54, -12, -79, 104, -121, -24, 121, 93, -27, 125, -105, 64, 42, -48, 88, 122, 64, 85, - -58, -57, -41, 63, -53, 116, -97, 60, -17, -28, -123, -82, 28, 105, -45, 90, -82, -16, 36, - 106, 63, 79, -38, -109, 63, 28, 60, -112, 117, -86, -114, -117, -109, 70, 118, -122, 51, -49, - -18, 106, 67, -110, 66, -37, -15, 106, 124, -118, 108, 41, -128, -75, -18, 80, -14, -123, -36, - 84, -5, 119, -61, 73, -118, 7, 83, 65, 36, -22, 39, 104, -62, 73, 32, 107, -89, -109, - 55, 103, -59, -66, -36, -91, 52, 117, 92, 69, -7, 75, -90, 47, -119, -94, -53, -78, 33, - 3, 68, 2, -95, -93, -116, -27, 46, 7, -96, -84, 78, 103, 15, 27, 95, 67, 57, -42, - 77, -94, -73, -33, 84, -24, 21, 34, -102, -5, 99, 4, -20, -15, -57, 117, -89, 88, -62, - -47, -98, -71, 31, -100, 88, 93, 30, 112, 38, 47, 71, -62, 41, 27, 21, -12, -48, -100, - 114, -124, -17, -123, -117, -10, 55, -117, 102, 113, 58, -121, 57, 51, 101, -54, 78, 53, -16, - -121, -32, -28, 88, -52, 0, -73, 56, -16, -45, 113, -21, -18, -94, 74, -21, -48, 71, -50, - -54, 99, 60, 30, 123, 34, -39, -70, -83, -110, -27, 38, -123, -28, -61, -110, 47, 26, 23, - 113, -27, -10, 20, -57, -97, -41, -58, -34, -24, 13, -62, 47, 81, -14, 103, -127, 66, -5, - -17, -25, 19, 69, 75, 97, -57, -30, 76, 30, -2, 96, 111, 28, 46, -88, 45, 33, 105, - 33, -26, 1, 82, -9, -70, 111, 23, 105, 98, 36, 30, 95, 60, -89, -4, 27, 77, -7, - 113, -37, 91, -20, -75, -5, -7, -41, -97, 80, 15, -86, -122, -72, 58, 115, 37, 11, -105, - 83, -47, 112, 92, -118, -4, -22, -51, -26, 34, -42, 47, 10, 40, -79, 112, -119, -10, -76, - 122, -35, -66, -120, 121, 25, 91, -4, -92, 39, -69, 49, 60, -72, 10, 3, 121, 14, -9, - 95, 123, -47, 117, -31, 14, 99, -81, -22, -94, -7, 105, 96, -76, 27, -87, -48, 32, -74, - 32, -127, -6, -26, -88, 52, 31, 73, -110, -9, -105, 119, -49, -123, 47, -90, -79, -43, -28, - -101, -27, 70, 36, 6, 89, 99, -83, -90, 11, -44, -30, -63, -8, 71, -29, -48, -12, -51, - -53, -121, -78, 55, 35, 25, 57, -74, -28, -110, -109, -66, 126, 37, -126, 103, -76, 57, -85, - -84, -91, -126, 103, -54, -45, 107, -22, -76, -67, 51, 29, -6, -107, -48, 71, -37, -108, 70, - 7, -8, -55, 83, 53, 1, 88, -117, -43, 41, -88, 49, 23, -96, 98, 61, 22, -66, -40, - -76, -4, -105, -19, 41, -58, 54, -86, -110, -46, -54, 108, 69, 63, 92, 20, 49, 41, -16, - -84, -66, -40, -111, 78, -53, 70, 93, 82, -81, -38, -19, -9, 90, -85, 84, -128, 19, -12, - 88, -6, 14, 0, -84, -74, 95, -94, -36, 108, -36, 86, 51, -29, -6, -52, -24, -99, -21, - 13, -56, -35, -118, 17, 106, -119, -25, -54, -85, -82, 0, -9, 49, 52, -85, -8, -72, -9, - -54, -11, -31, 23, 51, 111, 56, 85, -61, 67, -58, 20, 95, -112, 89, 64, 49, 117, 96, - -111, 19, -111, 106, 2, 64, -73, -14, -65, 0, -96, 51, 26, -118, -78, 112, -50, -89, -60, - 106, 6, -127, 32, -1, 57, -122, -10, -78, -52, 13, -15, -44, -65, -65, 42, 72, 110, 2, - -62, 25, 90, -77, 14, -37, -127, -75, 29, 87, 115, -25, -17, 64, -112, -14, -61, -47, -104, - -128, -29, -111, -45, 54, -24, 86, -108, -64, 45, 58, -124, -25, 23, 110, 82, 107, 124, -95, - -100, -125, -128, -56, -103, -44, 2, -28, 104, 96, 123, 121, 36, -83, 15, -73, -40, -125, 82, - 41, 60, 57, 57, 74, 123, -61, -52, 81, 68, -63, 83, 23, 47, -87, -32, -126, -106, 10, - -4, 98, -2, -39, 95, -20, -48, 69, 96, 7, -34, -95, -33, -93, 20, 68, -67, 98, -113, - 53, -126, -106, 32, 72, -38, -37, 74, 45, -12, -25, 117, 47, 107, -64, -24, -93, 92, -41, - 60, 32, -43, 115, -56, -127, 102, -6, 24, -30, 34, 38, -60, -98, -92, 105, 1, 9, -15, - -56, -60, 88, 66, -109, 122, 122, -114, 126, 89, -124, 61, -58, 65, 91, -81, -80, 58, -89, - -119, 57, 120, 38, 57, -66, 110, -98, -63, -67, -6, -51, 102, 17, 25, 42, -78, -75, 79, - 31, -18, 36, 107, 34, -76, -102, 46, 31, -77, -71, -85, 43, 45, 24, 6, 59, 24, -13, - -2, 46, 101, -80, -125, 26, 39, -19, -121, 123, 97, -16, 27, -53, 83, -101, -45, -62, -83, - 50, 27, 64, 108, 27, -10, 115, 47, -65, 42, 32, 21, 92, -49, -54, 117, -5, 85, -10, - 23, 115, -107, -21, -40, 26, -119, -101, 34, 113, -93, 42, 37, -113, 105, -25, 96, 89, 31, - 27, -2, -80, -65, -47, -110, -72, -30, 9, 47, -113, 88, 26, 37, 74, -108, 17, 84, -119, - -29, 54, 13, 1, -110, -79, 50, -115, 15, -50, 94, -47, 125, -19, -62, 36, -34, 53, 98, - -45, -95, 50, -127, 102, 69, -74, 52, -17, 4, -83, 78, 108, 92, -35, 121, -88, 35, 26, - -17, 36, 60, -25, -3, -80, -105, 86, 43, -56, -73, -59, 23, -60, -81, 27, -97, 101, 122, - 81, -106, 5, 111, 12, 16, -50, 9, 31, -79, -34, -88, -14, 104, 67, -14, -56, 56, -94, - 37, -27, -15, -36, -119, -55, 9, -25, 104, 61, -95, -48, -90, 24, -37, 8, 64, -91, 66, - -19, 5, 95, 29, 16, -73, 126, 8, 62, 71, -91, 61, -61, -47, 96, 112, -67, 45, 39, - -59, 107, -101, 88, 40, 108, 67, 59, 34, -100, 24, -62, -31, -66, 1, -115, 5, -36, -98, - -2, -13, -93, -125, 42, -81, -114, 122, 28, 29, -41, 93, 14, -125, -50, -9, 119, 89, -39, - -60, 17, -44, 101, -98, -53, -127, 93, 80, -17, 44, -85, -72, 42, 119, -67, 67, 11, 64, - -32, -67, -78, 24, 37, -124, 14, -62, 8, 59, 15, -46, 71, -120, -49, -64, -112, 24, -112, - -98, 78, -92, -1, -75, 104, 75, -8, -27, 12, -4, 32, -72, -94, 95, -38, 1, 84, 99, - -88, -103, -86, -102, -115, -44, -26, -98, -108, -94, 34, 45, -121, -107, -38, 42, 86, 34, -110, - 43, -54, -6, 101, -41, -83, 50, 94, -53, -17, -93, 14, 58, 94, 65, 104, -92, 119, -31, - 40, -96, -116, 60, -52, -5, -65, 69, -21, -58, 107, 109, -79, -85, 80, 62, 113, -119, 118, - 27, 78, 89, -16, -42, -116, -98, 71, -5, 70, -67, 125, 62, -55, 25, -124, 22, 125, 37, - 100, 89, 59, 61, -91, 62, 98, 98, -110, 20, 26, 124, -52, -4, -122, -113, -19, 40, 63, - -78, 11, 14, -13, -25, 22, -55, -37, -27, -98, 98, -35, -11, 51, 124, 23, 32, 104, -107, - 100, -6, -89, -77, -113, -26, 111, 91, 71, 72, 116, 30, -61, 99, 98, 107, -96, 8, 16, - 80, -52, 24, 29, -16, 30, 15, -45, -118, 24, 80, 4, -61, -4, -126, -3, 84, 99, 69, - -115, -54, -115, -72, 18, 98, 18, 65, -26, -22, 106, -121, 88, 110, -64, -76, 71, -46, -35, - 103, -69, 25, -94, -93, -53, -127, -32, -58, 85, 93, -110, 29, -63, -3, 3, -124, 16, 59, - 100, -110, 124, 29, 42, 61, -124, -94, 27, 56, 92, 112, -94, -18, 74, 98, 15, 115, 36, - -100, -39, 20, 18, -32, 100, 16, 18, -96, -94, -90, 116, 102, 78, 14, 48, -127, -63, -98, - -103, 76, -15, -8, 56, 87, -120, -62, 111, 5, -42, 16, 24, 73, 67, -20, -83, -67, -42, - 88, 106, 65, -17, 12, 124, -11, 69, -89, 101, 42, 123, -4, 2, -56, 87, -79, 34, -110, - 122, -101, 32, -81, 58, 75, -10, 118, -101, 42, -23, -92, -23, -128, 125, -114, -75, 82, -17, - -52, 97, 121, -58, -87, 125, 95, 59, -67, 10, -47, -105, 103, -2, -72, 99, -127, -99, -69, - 81, -100, 14, 2, -67, -110, -79, -54, 6, -54, -109, 12, 99, 115, -100, 61, -16, 107, 44, - -70, -37, 32, 87, 111, -6, 12, -113, -123, 96, 39, -103, -74, -114, 114, -52, -120, 82, 44, - 67, 111, -102, 122, -118, 55, 33, -38, 74, -101, -113, -58, -80, 20, -9, -110, -120, 105, -15, - 11, 33, -43, -32, 3, 23, 121, 122, 40, -1, 6, 21, -52, -93, 50, -5, 126, 63, 28, - 120, -66, 24, -32, 45, -57, -40, -65, -32, 21, 0, 83, 75, -117, -120, 61, 62, 35, 108, - -9, 66, -89, -93, 63, -8, 53, 88, -123, 120, 98, 64, -32, -55, -2, -95, -15, 35, 10, - -91, 92, 0, -50, 17, 88, 14, -126, -14, -46, -28, 108, -92, 35, -95, -14, -38, -2, 99, - 124, -110, -17, -44, 84, -107, 110, 38, -77, 32, -58, -99, 55, 73, 32, 10, 75, -54, 71, - 83, 12, 69, -111, -52, 6, -91, 111, -58, -43, 28, 80, -5, 51, 57, -112, -92, 80, -119, - -48, -41, 21, -63, 19, 57, -43, 12, -110, 25, 29, 43, 35, -124, -35, -68, -124, 93, -65, - -78, 12, 72, -120, -51, -45, 1, -34, 60, -114, -35, -59, 115, 43, 50, 62, 86, 56, -87, - 55, -74, -63, 99, -35, 72, -86, 124, 125, -53, 20, -84, -97, 109, -112, 86, 100, -64, 33, - -113, 6, -65, -79, 17, 68, 76, -6, -24, -7, 51, 97, -78, 86, -101, -106, -79, 43, -54, - -125, -46, -28, -103, 67, 10, -73, 11, -7, 86, -63, 8, 51, 17, -99, 115, 27, -79, 30, - 9, 85, 81, -55, 123, 96, 77, 109, -76, 16, -36, -119, 52, 47, -89, 71, 99, -113, 92, - 121, -111, 2, 13, -1, -17, -2, -68, 82, -95, 73, -88, -111, 34, 40, 21, 25, 72, -2, - 48, -28, 40, -69, -114, 112, -91, 45, -1, -102, 27, -33, -43, -23, 33, -6, 39, 10, -88, - 52, -74, -127, 14, -64, 93, 124, -62, 122, -25, -24, -114, 108, 2, -45, -48, -48, -15, -123, - -46, -47, -51, -30, -98, 114, -88, 126, -71, 115, -121, 9, 98, -102, -6, 81, -97, -17, 58, - -108, 69, 46, -120, -12, -4, 44, 73, 31, 22, 120, -43, -60, 119, 101, 53, -24, 55, 38, - -60, 9, -35, 79, -92, -125, 122, 88, 125, 18, -45, -6, -116, 5, 60, -85, -37, 71, -91, - -30, 120, 49, 118, -25, 119, 20, -26, -12, 71, 25, -54, -72, -44, -96, 33, 20, 4, 114, - 126, -24, -3, -42, -119, -84, 107, -115, 84, -30, -112, 72, 27, 41, 2, -114, -93, 19, 89, - -99, -66, -13, 20, -2, 103, 29, -34, 49, 82, 29, 0, -110, -6, 49, 99, -77, 124, 23, - -6, -24, -6, 75, 46, -127, -124, -106, 31, -56, -15, -33, 5, 108, 104, -21, -47, -40, -38, - -89, -111, -81, 0, 3, -22, 89, 3, 62, -74, 120, 64, -40, -28, 51, 1, -54, 54, 81, - -118, -32, 61, 6, -80, -94, 37, 5, 68, 122, -60, 92, -15, 67, 8, -88, 11, 48, -82, - -92, 18, 88, 41, -66, 59, -31, 4, -36, -84, -57, -23, -114, -42, 79, -40, -106, -77, 96, - 98, 5, 39, -34, 96, 84, -83, -47, 103, -126, -115, -101, 109, -124, -119, 82, -114, -32, 62, - 102, -60, 119, 88, 116, -58, 93, -12, 125, 44, 88, -49, -80, 52, 3, 118, 12, 41, -28, - -75, 70, -45, 71, 117, -15, 13, -52, -69, -106, -27, 44, 102, 62, 123, -110, -77, -25, 77, - 56, -74, -95, 54, 42, -6, 65, 18, -121, -60, 25, -127, -30, -83, 111, 26, 90, -9, -5, - 37, 22, -60, -94, -34, -105, -73, -40, -65, 12, 54, 110, -50, 2, -94, -14, 124, 40, -79, - 103, 110, 38, 89, -11, 31, -42, -122, -48, -48, -115, 102, -31, -128, 43, -107, 89, -115, 29, - -76, -67, 0, 87, 95, -74, 85, -8, -113, -110, 6, 76, 11, 21, 5, 36, 41, -18, -81, - 62, -126, -87, -36, -51, -117, 85, -21, 88, -124, 114, -19, -26, -103, -88, -20, 84, 91, 37, - -69, 115, -128, 118, -104, 120, 86, 59, -16, 57, 18, 37, -105, -93, 0, 17, 50, -86, -105, - -108, 84, 93, 14, -43, 5, 21, 89, 101, -39, 16, 23, 91, 82, 3, 3, -101, -104, -19, - -23, 26, -96, -42, 102, -107, -107, 110, -24, -5, 61, -12, -86, 104, 70, 14, 38, 103, 25, - 78, -62, 74, 33, -22, -116, 113, -68, -91, -7, -29, -104, 62, 23, 39, 20, 107, -94, -61, - -90, -54, -121, -124, 15, -4, 98, -29, 21, 32, 9, -54, -3, 102, 5, -73, 50, -20, 82, - 90, 9, 100, 112, 2, -75, -21, -111, -65, 62, -66, 59, 110, 87, -29, -60, -101, 116, 43, - -17, -27, -11, 93, 52, 82, 86, -100, -110, -110, 111, 43, -79, -119, -19, 88, 115, 18, 115, - 33, 57, -77, 100, 95, -112, 9, -64, 42, 54, -51, -15, 18, 38, 111, 8, -30, 112, -54, - 91, 47, 89, 81, -6, -107, -84, 101, 8, -27, -128, -60, 48, -121, 1, -65, 55, -80, -68, - 24, -49, -28, -14, 92, 53, -3, 122, -79, -123, 98, -71, -46, 102, 105, 55, 119, -71, 43, - 80, 73, 20, 118, 31, -98, 24, 108, 32, 90, 46, 103, -28, 60, -54, 112, 62, -26, 52, - 48, -32, 91, 124, -81, 95, -32, 5, 34, -23, 114, -25, 86, 87, -43, -30, -35, 82, -14, - 111, -128, 2, 125, -109, 62, -32, -128, 121, -43, -82, -115, 78, 56, 71, -94, 56, -104, 118, - -45, 51, 89, -64, 18, 103, -116, 118, -3, -9, 3, -89, -65, 69, 75, 38, -15, -35, -55, - 43, 71, 120, 122, -37, 36, -55, -54, -116, -77, -40, 86, -115, -42, 122, -62, -97, -93, -84, - -76, 96, 5, -13, -71, -77, -8, 95, -40, 32, 85, 18, 55, 25, 55, -42, -77, 17, 104, - -87, -53, -65, -104, -78, 1, 43, 109, -92, -35, 32, -69, -33, -52, 38, -51, -111, -11, -111, - 27, -1, -99, 120, -32, 117, -118, 85, 59, 55, -81, -117, -80, 24, -8, 71, -109, -24, 40, - 39, 64, -57, 67, -57, -114, 75, 91, 119, 42, -33, -106, 31, -67, -76, -29, 1, 51, -45, - -79, -74, 34, -84, -117, -56, 76, 1, -115, -16, -51, 92, 44, -7, 4, -126, 7, -23, 52, - -19, -7, 89, -22, 95, -109, 86, 20, -123, 5, -67, -100, -101, -109, -11, 104, 51, 36, 80, - -95, 44, -19, 56, -71, -8, -113, 28, -51, -9, -19, -33, -100, 38, 28, 38, 4, -59, 85, - 61, -101, 8, -51, 15, 12, -104, 81, -89, -104, -18, -15, 120, -76, -66, -47, -79, -39, -44, - 124, 61, -64, -8, 100, -55, 34, -124, 54, 79, 62, 37, 62, 34, -10, -14, -28, 38, -121, - -92, 86, -91, 37, 49, 91, 0, -27, -36, -25, 6, -115, 95, 64, -58, -40, 59, -7, 19, - 48, -76, -87, 25, 23, 31, -102, 38, -2, -40, -113, 121, -121, 8, 51, -39, 48, 53, -10, - -120, 117, -62, -125, -98, 51, 54, 87, -97, 105, 120, -4, -72, 11, -65, 60, 115, 41, -80, - 8, 57, -71, -69, -96, 5, -15, 81, 89, -2, 11, -97, 110, 60, -9, -11, -61, -70, 9, - -22, -67, -60, 67, -1, 30, 7, -69, -1, 83, -21, 112, 24, 56, 90, 111, -44, 13, -127, - 59, 110, -105, 23, 100, 87, -81, -75, 74, -52, -98, -115, -121, -38, 24, -71, 83, -126, -93, - -33, -80, 94, 2, -38, -16, 84, 54, 9, 0, 99, -76, -77, -99, 95, -17, 63, 100, 3, - 98, -116, 88, 17, 115, 60, -98, 43, 87, 22, 73, 76, -47, -2, -17, 90, -27, 40, 123, - -106, 16, 117, -58, 85, 113, 30, 90, 97, -11, 40, -15, 34, -12, -18, -63, -3, 95, 59, - -3, 46, 71, 58, 120, -109, -18, -92, -9, -92, -11, -85, 49, 72, 120, 46, -119, -118, 18, - 85, -48, -118, 123, -40, 36, 38, 59, -128, 90, 65, 23, -23, -6, 54, -28, -91, -34, 54, - -59, 76, 58, -16, -113, 88, 50, 105, -27, -6, 59, -125, -121, -99, 11, 53, -41, -84, 53, - -104, 11, -127, 115, 23, 64, 63, 82, 23, 20, -27, -118, -20, -105, -68, 110, 114, -122, -65, - 7, 80, -67, -39, -34, 112, -19, -57, 39, 63, -73, 61, 86, 62, 26, 69, 111, 64, 89, - 54, -85, 116, -44, -57, 97, -78, -55, -49, 107, -60, 72, 37, -101, -5, 6, -58, -126, -102, - 31, 87, 33, -66, 103, 96, 39, -70, 94, -107, 28, 115, 10, -43, -72, 93, -125, 51, -31, - -123, -95, -127, -97, 36, 36, 97, 38, 97, -122, -4, 113, -16, -109, 66, -42, 103, 33, -34, - 39, -120, 43, -85, 117, 100, 17, -52, 122, 79, 41, -106, 11, 118, -121, -79, -16, 21, -62, - -124, -124, 75, -51, 38, -61, -49, 35, -26, 39, 27, -95, 63, 50, -46, -37, 22, -17, -29, - 23, -18, 80, 34, 110, -84, 25, 95, -59, -109, -33, -2, -42, 39, 54, 99, 35, 61, -21, - -117, -118, 126, -78, 113, 3, -104, 97, -29, -5, -55, 77, 26, 83, -75, 66, 122, -14, -25, - -104, 58, -75, 122, 42, -96, -73, -104, 28, -2, -83, 107, -76, -61, -109, -49, -93, -118, 104, - 107, -4, -75, -98, -31, -36, 3, 59, 95, -95, -65, 92, 70, -105, -69, -117, -15, -36, -80, - -99, 41, 120, -58, 13, 94, -2, -78, 4, -79, -57, -95, -51, -16, -69, 94, 32, 6, 93, - -43, 34, -123, -112, -78, -63, -106, 85, 72, 31, -33, 64, 43, 9, 50, 84, 5, 62, 73, - 115, 54, -80, -14, -118, -124, 22, -103, -61, 51, 97, -8, -79, -78, 99, -27, -70, -82, -65, - -33, 63, -39, -36, -77, -109, 121, 97, -15, -3, -107, -27, 36, -69, -75, 97, 119, -103, -12, - 55, -41, 77, 59, -68, -110, -119, -61, 63, 16, -100, 34, 74, 32, -2, 15, -30, 15, -59, - -38, -96, -77, 12, 105, -18, -85, -98, 63, -5, 124, 22, -123, -117, -99, 26, 113, 113, -48, - 76, -50, -37, -117, -72, -18, 93, -115, 28, 84, -13, -62, 122, -45, 63, 49, -25, -50, -35, - 1, -63, 32, 118, 86, -18, 52, -29, 95, 57, -9, -56, 119, 0, -99, -109, 0, 22, 109, - 39, -113, 91, 49, -72, -107, 102, -84, -67, -4, -70, 23, -58, -19, 80, -82, -106, -47, 1, - 114, 63, -108, -22, 94, -42, 100, 85, -66, -124, 13, -59, 39, -80, 7, 110, 102, -29, 52, - 79, 0, 55, -40, -56, 88, 74, -50, -99, -80, 69, -74, 99, -67, -67, 96, -39, 124, -1, - 5, 58, 37, 82, -55, -3, 12, -90, 90, -41, -46, -96, 83, 81, 58, -80, -50, -112, 117, - -105, -65, -127, 35, -7, 122, -79, -38, 103, -9, -24, -108, -52, 95, -71, 72, -13, 93, -60, - 2, 39, 34, 106, 22, -25, 103, 43, 48, -37, 120, 34, 5, -19, 95, 20, 52, -74, -95, - -86, -25, -125, 46, -126, -41, -44, -86, 73, -66, -83, -16, 2, 53, -29, 117, 63, 24, 125, - -10, 7, 58, -86, -49, -56, 36, 29, 72, 83, 88, -17, -73, -114, 8, 11, -30, -96, 88, - -113, 73, 39, -71, 119, -81, -30, -22, -109, 61, -16, 57, -90, 43, 1, -52, 73, 70, 123, - 22, -16, -109, -56, 19, 89, 67, -110, -59, 37, -58, 3, 118, -92, -102, 103, -103, 15, 62, - 64, 45, 53, -47, -5, -54, -26, 57, 79, -106, 113, 126, 85, 2, 124, -34, 76, 125, -91, - -44, -108, -29, -38, -120, 58, -36, 66, 0, -29, 97, 105, -64, 93, -90, 123, -57, -60, -99, - -38, 31, -20, -63, -117, -6, 64, 25, -7, 55, 74, -117, 104, -48, -100, 66, 79, -23, 4, - 36, 65, -56, -120, 15, 69, 23, 19, -101, 61, -92, -90, -40, -78, -14, 100, -124, -54, 9, - -122, 72, 82, -110, -57, -42, 103, 108, -101, 103, -69, -17, -83, -78, 25, 34, 102, 40, 90, - -28, -3, 67, -91, -108, -96, -56, -33, 7, 119, 59, -92, -121, 30, -3, 17, -17, -90, 34, - 19, 104, 2, -31, -37, -66, 37, -21, 95, 71, -61, 43, 30, 37, -112, -54, -119, -36, 42, - -58, 43, -73, -24, 16, 58, 79, 117, 46, -101, -108, 104, -25, -93, 103, -86, 72, 72, -37, - -55, 101, 126, 73, -39, 125, 6, -110, 110, 91, -17, 62, -15, -98, 115, 89, 5, 47, 80, - -21, 79, -70, 115, -6, -89, -18, 57, 125, -114, 43, 10, 112, 28, 124, -125, 53, 106, 47, - -122, -44, -78, 78, 20, 106, -93, -21, 46, -45, -97, -12, 110, -57, -107, 2, -114, -40, 64, - -47, -77, -25, 95, -89, -112, -71, -63, -86, 42, 120, 89, -12, 52, -83, -59, -38, -63, -37, - 42, -6, -90, 72, 111, -74, 24, 28, 28, -35, 119, -98, -63, -54, 39, 6, 74, -86, -89, - 27, -5, -38, -68, 32, -50, -127, 76, 6, -59, 36, 24, 114, 113, -49, 2, -100, -32, 74, - 52, -88, 37, 48, 53, 116, -28, 91, -125, 51, 32, 74, 123, -121, 126, -1, -41, 113, 9, - 16, -69, -97, -94, -32, -97, 88, 38, 8, -50, 38, -39, 11, -40, -43, 61, 46, 117, -3, - -14, -10, 44, 28, -114, -55, 119, -14, -69, 93, 85, -46, 42, 99, 31, 15, -120, 31, -116, - -123, -118, 18, 77, -19, -41, -15, -122, 68, 41, -40, 49, 38, -25, -38, 74, -67, -44, 18, - 8, 80, -96, -16, 105, 125, -55, -46, -125, 57, 83, -49, -59, 5, 25, 18, -59, 60, 5, - -85, 48, -86, 31, -56, 98, 91, 77, 47, -71, 82, 63, 28, 88, -120, -128, 109, 112, 31, - 0, 44, -26, -37, 83, 66, -28, -83, 21, 10, 49, -55, 48, -126, -15, 63, -71, -65, 110, - -4, -15, 77, 58, 9, 83, 17, 72, -24, -11, 101, 97, 29, 68, 38, 100, -2, -67, -63, - -10, 22, -59, 2, 96, -48, 111, -77, -114, -57, 86, -40, -66, -100, 20, 92, -106, 18, -61, - 105, -41, -2, -70, 13, 105, 112, 23, 122, 18, 73, -81, -98, -67, -67, -118, 106, -120, 120, - 116, 20, -118, 0, -43, -19, -58, -20, -35, 30, -8, 65, -38, 63, 35, -119, -86, 93, -101, - -82, -28, -93, -66, -38, -112, -98, 98, -97, -41, -50, -60, 36, -97, 35, 69, 26, -27, 80, - -40, 17, -73, -84, -4, -93, -26, -11, -42, 6, 84, 82, 8, 93, 101, 57, -9, -71, -35, - 26, -127, -73, 70, -111, 30, 95, -58, 59, 7, 47, -120, 51, -36, 30, -66, 113, -117, 75, - -52, 90, 9, 3, -87, 22, -64, -96, -66, 43, 93, -103, -122, -74, -46, 12, 32, -82, -101, - 64, 61, 125, 15, 105, 84, 92, 76, 92, 60, -34, 72, -97, 2, 26, -14, 3, 5, -91, - -127, -68, -28, 116, 10, 80, 90, -90, -76, 93, 118, 9, -42, 17, -70, -84, 34, 101, -122, - -73, -97, -61, -5, 94, -25, 43, 109, -3, 26, -99, 112, 17, -59, 89, 97, -105, 75, 58, - 42, 50, -51, -98, 95, -65, 93, -88, -47, 4, 8, -117, 60, 31, 0, -19, 85, -1, 50, - 20, 56, -116, 125, -2, -12, 41, 98, -10, -58, -26, -58, 45, -87, -18, 103, 53, -36, 30, - -126, -65, -70, -87, -44, -18, -33, 59, -39, 62, 59, -20, 97, -126, 86, -24, -13, 76, -72, - -28, -31, 53, 77, 15, 49, 112, 70, 20, 116, 98, -81, 73, 95, 43, -82, 13, -92, -97, - 60, -7, 102, -23, -118, -33, 113, -94, -66, -100, -21, 23, 61, -120, -17, 57, 76, 28, -122, - -111, -50, 14, 21, -96, 102, -45, -13, 15, 46, 117, -18, -113, 14, 82, 116, 81, 9, -1, - -69, 118, -96, 39, -58, 39, 126, 121, 85, -73, -60, 74, -75, 2, -14, -2, -79, -90, 45, - -45, 18, 3, 32, 96, 14, 49, 90, -5, 66, 75, -109, -90, -67, -35, -9, 5, -107, -9, - -64, 44, -16, 16, -64, -38, 63, -36, -60, -25, -53, -29, 32, 45, -50, 96, 73, 82, -95, - -77, -69, 102, -103, -77, 64, -41, -40, -37, -122, 65, 49, 16, 55, 91, 106, 13, -102, 22, - 90, -3, -85, -31, 126, 96, 118, 113, -124, -80, 34, -88, 63, 111, 71, -42, -76, -76, -63, - 84, 18, 66, 63, -65, 72, -121, -73, -42, -104, 114, 18, -97, 85, -108, 49, -13, 8, 70, - -95, -52, -10, -14, 16, 103, -75, -51, 122, -2, 67, 120, -56, -53, -86, 67, -118, 16, -89, - -97, 79, 15, 71, 30, -123, -94, -52, -31, 74, 12, 70, 101, -88, -87, 69, -57, -13, 82, - -87, -74, -33, -45, 35, 125, 105, 119, 85, -18, 51, -20, -123, 109, 96, -65, 29, 122, -112, - 82, -67, -37, 45, 63, 75, 86, 95, -29, -72, 52, 53, 42, 101, -39, -28, 108, -83, 27, - -47, 92, 81, -9, -103, -65, -72, -61, 64, -34, -40, -93, 54, -60, 12, -10, 4, -21, 110, - 43, 23, -61, -23, -13, -122, -114, 116, -10, 88, -121, -43, -6, 110, -49, 47, 76, -55, 29, - -107, 60, -24, 6, 113, 12, 45, -31, 97, 80, 78, -19, 11, -57, -84, 125, -38, -47, -128, - 82, 109, -3, -117, -100, -114, -107, 94, -29, -39, 29, 113, -124, -61, -97, 7, 62, 106, -47, - 110, -108, 114, -28, 72, -57, 99, -97, -33, 14, -39, -122, -84, -59, -44, -127, 101, -95, -41, - 40, 110, 1, -8, 103, 104, -50, 71, -65, 91, 12, 56, 12, 3, -78, 3, 29, -112, -19, - 97, 78, -21, -67, 72, 84, 42, 19, 14, 96, 103, 37, -90, 76, 44, 67, 105, -124, 19, - 79, -59, -2, -74, -116, -110, 49, 25, 16, -94, -56, -120, -17, -119, -64, 111, -89, 91, 78, - 23, -90, 38, 92, -68, 66, 8, -107, -8, -20, -87, 89, 122, -7, 18, -116, -37, 11, 80, - -75, 104, -98, 102, -48, -22, 65, 117, -89, 104, -22, 33, 106, -92, 10, -17, 52, 67, -15, - 86, 47, -97, 103, 83, -78, 7, 32, 46, 46, -110, 22, 67, -35, -42, 40, -76, 8, 74, - 72, -18, 32, 35, 9, 53, -17, -39, -81, -27, -40, -92, -33, -82, -51, -57, -96, 81, -24, - -122, 4, 64, 99, -61, -90, 83, -89, 71, 5, -105, -22, -98, 109, 42, -77, 85, 20, 60, - -24, 75, -18, 126, -102, -81, -29, -108, -123, -122, -103, -69, -107, 60, 14, 11, 114, 77, 114, - 62, -24, 76, 116, -38, -71, 124, -87, 111, -61, 36, 30, 93, -75, -13, 28, -113, -21, -41, - 9, 6, 7, -10, 38, -15, -41, 43, 42, -12, -106, -59, 42, 29, 71, -71, 9, 124, 25, - -126, -57, -35, 120, 23, -105, -124, -77, -67, -117, 16, 31, -124, -28, 34, 30, -58, -39, -127, - 28, 93, 22, -38, -76, 106, -50, -32, 52, -1, -45, -86, 84, -37, -1, -24, -93, 55, 2, - 97, 93, 91, -67, 99, -73, -19, -67, -43, -54, -96, -112, 11, 71, -68, -43, 94, -103, -24, - -8, 108, -37, -114, 9, 8, 73, -80, -87, -102, 92, 69, -75, -69, 109, -34, 84, 60, 16, - 90, 59, 43, -53, -43, -78, 90, 115, -58, 66, 50, 41, 64, 67, -43, 57, -27, -26, 47, - 122, -40, -84, 118, -123, 35, 80, -53, 38, 123, -5, 64, -102, -84, 38, -42, -83, 68, 40, - -92, 49, -103, 113, -93, -60, -7, 58, 4, 124, 104, 38, -20, -83, -63, -12, 59, 20, -82, - -23, 106, 78, -87, -18, 80, 87, -38, 58, -98, -35, -56, -119, -65, 45, 61, -59, -120, -119, - 70, 49, -75, 34, -57, 51, -76, -79, -34, 74, -6, -55, -82, -86, 70, 10, 26, -119, -91, - 42, 107, -26, -50, -72, -104, -12, 66, -4, 21, -64, -12, 32, 24, -93, -17, 43, -123, -18, - -52, 97, -8, -109, 88, 27, -91, -94, -62, -15, -36, 14, -36, -114, 62, 3, -44, -10, 42, - -7, -47, 110, 118, -33, -24, 39, 87, 84, -30, -60, -73, -101, -127, 55, -68, -60, -23, 122, - -62, -17, -125, 122, -52, -116, 80, 111, 20, -45, -63, -76, 99, 107, -25, -83, -34, 111, 85, - 113, 84, -6, -105, 96, 78, -30, -113, 112, -83, 38, 40, -4, 42, 54, -4, 20, -126, -24, - 6, 3, 80, 84, 58, 56, 59, 61, 24, -36, -80, -107, -12, -30, 25, -56, -98, -125, -13, - 111, -38, 58, 81, -97, 63, -20, -88, -12, 21, -97, 48, -5, -84, 35, -64, 69, -35, 31, - -110, -27, 41, 84, 101, -47, -58, 39, 43, 49, 22, 91, 52, 87, 100, 96, -19, -26, -97, - 81, 56, 32, 103, 25, -62, -2, 19, -85, -12, 54, 63, 47, -39, -97, -98, -29, 97, -65, - -83, -43, 66, 110, -45, 31, -48, 3, 88, -42, -71, 108, -50, -61, 116, -127, 59, 22, 95, - -79, -39, -114, -11, -12, -14, 19, 125, -7, 80, -119, -57, 34, 66, 52, 40, -54, -62, 53, - -79, -66, -46, 92, 97, 78, -113, -32, 111, 29, -54, 29, 78, -120, 18, -13, -121, -32, 88, - 64, -90, -117, 92, -93, -52, 35, -49, -68, 82, -85, -75, 30, -78, 28, -117, -21, 58, 60, - -89, -67, 61, 55, 97, -72, 18, -110, -1, -36, -80, -72, -103, -75, -51, 31, -75, -63, -67, - 11, -102, -86, 25, 90, 27, -114, -102, 48, -91, -9, -58, -24, 8, -110, 109, 100, 25, 100, - -13, -80, -27, -35, 109, 46, 43, -55, -90, 84, 59, -19, -111, 32, 120, -89, 41, -110, 105, - 105, -122, -122, -92, -40, 37, 69, 5, -125, -52, 67, 110, -74, -69, 61, -36, 28, -51, -57, - 112, -36, 78, 42, 69, -3, 107, 103, 66, 120, 71, -82, 51, 77, 93, 96, -21, -93, -107, - -124, -15, 112, 104, 33, 69, -126, -79, -37, 14, 107, -45, 62, -29, 33, -55, -33, 59, -87, - -23, 76, -40, -42, -89, -35, -95, 116, -114, -30, 14, -20, -33, -15, 28, 111, 119, -51, 45, - 46, -29, -118, -24, -31, 32, -102, -119, -48, 108, 37, 71, -15, -68, -17, -96, 40, 31, 88, - 17, -63, 19, -31, -49, 113, -127, -58, -79, -85, 43, -94, -83, 100, 4, -22, 26, -78, -16, - 119, 115, -50, 5, -13, -108, 5, -42, 108, 46, 19, 47, 65, 8, 56, 20, 89, 118, 99, - -112, 70, -46, -114, 16, 95, -40, -104, -96, 40, -2, -7, 76, 47, -17, 122, 52, 101, -1, - 71, -34, -104, -120, -48, 12, 22, -65, 62, 93, -79, 119, -36, -14, 79, -53, -125, -120, -30, - 2, -124, -63, 50, -92, -107, -78, -128, 49, 33, 125, -48, 70, 38, 3, -30, -99, -58, 105, - -83, -109, 68, 91, 39, -11, 0, 40, 24, -114, -43, -106, 8, 12, 14, -111, 0, 113, 16, - 47, -104, 31, -115, 126, 73, -30, 95, 98, 98, -15, 34, -9, 78, -69, 62, -36, -30, -12, - -17, -63, 104, -26, 88, 87, -55, -113, -39, -29, -124, -16, -109, 126, 30, -73, 85, 123, 46, - -26, -81, 10, 30, -44, 52, 72, 114, 120, 98, 122, 27, 27, -121, 34, -50, 39, -36, -82, - -74, 3, -41, -62, 97, -106, 83, -94, 44, -46, 54, -4, 92, 73, 85, 27, 108, -10, -64, - 98, -94, -21, 15, 13, -91, 13, 68, 86, -84, -112, -4, -86, 8, 83, 25, -24, -111, -122, - 24, 20, 78, 89, 84, -103, -64, 8, -92, 119, 69, -112, 100, -68, -88, 27, -8, -67, 18, - 120, 1, 4, 110, -28, -12, -101, -15, -42, -40, -105, 24, -121, -10, -64, -103, -47, 116, 87, - -63, -68, -1, -94, -107, -97, -31, -107, -42, 71, -109, 105, 47, -39, 5, 86, 68, -29, 65, - -110, -117, 54, 75, -84, -109, -86, 84, -13, -107, 4, 34, 21, 44, -25, -40, -122, 75, 42, - 123, 118, -55, 77, 82, -102, 7, 61, 6, -38, -56, 93, 118, 95, 58, 3, 79, -60, 69, - 35, -33, 16, -80, 54, 115, 88, 7, 60, -119, 70, -107, -15, 82, 17, 120, 75, -49, 111, - 81, -26, -100, 99, 48, 76, 90, -45, -123, -23, 64, 26, 21, -22, -65, 106, -107, 26, 52, - -66, -115, 98, 79, 2, -72, 44, -116, -46, 86, 77, -65, 19, -116, -123, 45, -108, 28, 101, - -75, 36, 35, 15, -78, 85, -5, -82, -43, 52, 68, 88, 96, -1, 123, -124, -19, 32, -111, - 78, -127, 3, -107, -88, -92, -126, -101, -21, 90, -104, -73, 27, -71, 125, 2, 87, -4, 14, - 76, -126, -104, -22, -64, -7, -92, -37, 22, 41, -3, -38, 17, 20, -85, -53, -34, 101, -60, - 79, -98, -54, 5, -123, -33, -40, -50, -17, 107, -12, 60, -22, 118, -72, -6, -101, -116, 29, - -112, 71, 99, -54, 49, -104, 77, -128, 28, 125, -127, -98, 47, -4, -40, -11, -15, -20, 87, - -2, 41, -45, -83, -86, -113, -81, -127, -15, 50, 53, -104, -123, 54, 120, -89, -106, 87, 89, - 82, 40, 78, 62, 0, 118, 83, -72, 10, -120, 8, 1, 21, 71, -15, -34, -120, 39, -35, - -30, -125, -117, 113, 111, -45, 8, -29, -77, -11, -108, 125, -125, 120, -39, -98, 70, -123, 27, - -4, 63, 25, -89, 88, 43, -29, -86, 69, -30, -81, -88, -67, 74, 29, -71, 74, -90, 46, - 89, -95, 93, -89, -112, 36, 40, 7, -101, 27, 43, -89, 101, 80, 93, -24, 85, 4, -28, - -6, -51, -16, -27, -103, -65, -96, -32, 110, 1, -114, 110, -91, 48, 96, 69, 102, -84, 47, - 116, 26, -61, 19, -57, -68, -128, -27, 89, -125, -32, 4, 63, 29, -24, 44, 63, -41, -9, - -37, -27, -13, -89, -65, 44, -54, 42, -36, -86, -55, 119, 38, 97, 32, 91, 28, -78, -122, - 54, 98, -120, -93, -84, 101, 18, 70, 88, 65, -112, -124, 19, 62, 27, -65, -57, -24, 8, - -30, 42, -108, -116, 91, 17, 98, 113, 90, 75, -87, -38, -47, -96, -55, 86, 40, 44, -46, - -9, -16, 43, -73, 84, 47, -56, -47, -18, 78, 123, -16, 108, -128, 69, -66, 51, -18, 26, - -19, 15, -83, 19, -99, -18, -61, 85, 66, -104, -61, -7, -16, 83, 74, 39, -102, -108, -120, - -7, 76, -87, -43, -120, -113, -103, -102, -31, -56, -104, -14, 62, -59, -46, -118, -36, -10, 15, - -10, -9, 104, 26, -96, -42, 92, -86, 7, 116, 125, -32, -80, -32, 105, 36, 15, 105, 74, - -87, -102, 81, -42, -75, -28, 4, 76, 82, 96, -66, -110, 92, -101, -87, -79, 29, 61, -95, - 32, 114, 64, -95, -58, -111, -74, 89, -26, -114, -87, 47, 56, 86, 104, 74, -49, 5, 12, - -120, -26, 80, -43, -52, -16, -61, 71, -75, 102, -34, 17, 115, 18, -8, -89, -36, -108, 72, - 49, -56, 93, -93, 4, -21, -44, 39, 38, 78, 100, 29, 121, -35, 75, -117, -77, 56, 70, - -92, -77, 70, -70, 8, 56, 49, 72, -122, -23, -119, 31, -34, -19, 110, -18, -115, 6, -9, - -122, 31, -40, 49, 11, -76, 55, 44, -102, -35, 9, 28, 109, 24, -101, 6, -52, 107, -75, - 7, -89, 87, 106, 29, 107, 15, 24, 19, -47, -86, 45, 77, 40, -120, 41, 70, 95, -100, - -76, 3, -127, -15, 84, -3, 117, 73, -31, 92, 17, -40, 94, 37, -23, -9, 111, -31, 28, - 80, 125, 112, 111, 106, 95, 68, -28, -55, 23, 32, 101, 111, -53, -80, 18, -34, 61, -10, - 93, 57, -5, 0, -17, -66, -102, -125, 67, 68, 55, 60, 115, -53, 108, 10, 98, 78, -1, - -104, -19, -29, -37, 119, -83, 9, 69, -12, 121, -114, -32, -104, -106, -110, 46, 107, 31, 103, - -81, -115, -34, 18, 100, -56, -90, 102, 90, -8, -19, -74, 84, 105, -126, 66, 5, 29, 91, - 96, -62, -91, -112, 34, 116, -48, 119, -56, 71, -51, 79, 78, 111, 25, 17, 107, 75, 73, - -57, -111, -114, -71, 123, 15, -1, 53, -92, 79, -73, 71, -112, -6, -95, 24, 51, 102, 37, - -99, 90, -8, -61, -28, 62, 21, 43, -97, -38, 19, 85, 122, 58, -86, 76, -93, -46, -104, - 36, -26, 62, -15, 121, 105, -46, 95, 23, 27, -91, -55, 111, 35, -80, -16, 90, 122, 94, - -4, -19, -98, -119, 13, 68, -64, 100, 70, -115, 96, -71, -60, -98, 39, 99, 98, 111, -106, - 108, 8, 50, -88, -28, -97, 51, -110, -7, -86, 46, 30, 125, -105, -4, -63, -88, 57, -39, - -53, 42, 81, -7, -29, 37, -18, 9, -122, -125, -57, 84, 35, 28, 77, -88, 104, 104, -87, - -4, 37, -2, 75, 126, -93, 87, -36, -84, -57, -55, 64, 6, 108, -7, 86, 126, -17, -10, - 113, 102, -43, -94, 60, 53, 40, -82, 30, 24, -82, 5, -4, 104, 66, -22, -67, -115, 28, - 56, 38, 87, -2, 115, -63, 125, 112, -10, 67, -45, -2, -55, -31, -4, 76, -71, -96, 96, - -126, -97, 6, 45, 114, 116, 25, -79, -72, 41, -5, 54, -88, 28, -24, 45, -59, -78, 57, - -71, -43, -57, -80, 8, -40, -43, 98, 66, 30, 36, 62, -17, 113, -78, 32, 106, -73, -111, - 70, -85, 63, -50, 9, -104, 51, 117, 54, 37, -87, -126, -26, -73, 9, -41, -102, 16, -44, - -11, 40, 59, -68, -86, -74, 3, 20, -106, 46, 67, -111, 15, -45, -46, -30, 126, -126, 82, - 113, -124, -58, 23, 81, -54, 103, -2, 88, 96, -98, 12, -126, 28, -69, 64, 64, -14, 14, - -128, 55, -4, -92, 122, -31, -71, 113, -18, 121, 44, 58, 54, 2, -12, -78, -17, -91, 89, - 17, -94, 114, 106, 89, 71, -123, 101, -110, 45, 97, 46, 74, -16, -108, -108, -109, 77, 6, - -62, 86, -1, -8, -30, 111, 113, -66, -51, -79, -38, -90, 19, 58, -88, -53, 0, -43, -90, - 66, 55, -26, -124, 49, -118, -9, 69, 60, -27, -128, -39, -104, -117, -84, -49, 86, 39, -47, - -31, 43, 14, -17, -119, 94, 8, 24, 88, 90, 51, -116, -102, -78, -64, 4, 106, 34, -115, - -25, -9, 28, -11, 95, -100, 109, 116, 87, 11, 26, 28, 94, -12, 101, 106, -24, 47, 73, - -84, 57, 126, -26, 123, 117, -71, -27, 65, 44, -82, 116, 126, 24, -27, -37, 123, -23, 68, - -98, 8, -46, -99, -123, 46, -74, -16, 123, 42, 40, -106, 88, 98, 15, -81, 103, 100, -67, - 83, 12, 97, -27, -86, -68, -49, -39, -63, 70, 4, -45, 84, 51, -111, 50, -38, -32, 14, - -65, -97, 0, 21, 117, -3, -28, -31, 49, 101, 24, 38, 92, 24, 4, 118, 37, -83, 55, - -119, -105, -66, 43, 11, -117, 122, -78, 31, 59, 68, -81, -9, -125, 102, -39, -49, 102, -56, - -102, 17, -8, -5, -109, -108, -56, -95, 41, 29, -16, -119, -77, -14, -121, -119, -91, 50, -68, - -56, -83, -124, -108, -72, -23, -7, -114, 43, 93, 28, -66, -20, -14, -29, 48, -20, -17, -43, - -84, -61, -13, 9, 124, 64, -83, 77, -126, 0, 72, -52, -59, -46, -32, -103, -105, -102, 33, - 38, 30, -56, 56, 2, 59, -15, 83, -62, -14, 75, 10, 63, 6, -104, 99, 7, 97, -24, - -67, -114, 85, -23, -97, -124, -95, 60, -110, -122, -75, -114, -38, 100, -5, 117, 113, 9, -125, - -47, -124, 120, -53, 100, 92, -53, 11, -93, 109, -70, -100, -48, -60, 109, 117, 90, -22, 51, - -19, 7, 115, 53, -9, -28, 1, -68, 68, 51, -49, 114, 35, -108, 91, -55, 2, -92, -33, - 80, -22, -108, 72, -46, 7, 122, 21, -124, -119, 2, -35, -77, 125, -80, -119, -76, -31, 91, - 101, -20, 104, 29, -6, -110, 77, -71, -52, -109, -6, -98, -103, -11, -4, -59, -4, 95, -7, - -8, -96, -93, 17, 87, 29, 83, 126, -82, 16, 66, -110, 23, 104, 114, 17, -28, -104, -58, - -14, -70, -121, -70, 82, 78, -46, -37, 21, 103, 117, 100, -115, 16, -50, -47, -57, 52, -100, - -34, -81, -5, 0, 67, 54, 94, 39, -62, -123, 116, 18, -17, -20, 29, -73, -8, -14, 88, - -105, 52, 24, -55, 10, 26, -45, -89, -30, 46, 107, 116, 124, -53, -110, 7, 21, 75, 44, - -21, 62, 75, 11, -58, -20, 94, 29, -30, -127, -33, 47, -116, 66, -102, 29, 13, 20, 101, - -92, 77, -34, 90, -49, -4, 81, 27, 56, 109, 115, 12, -64, 71, -124, -17, 2, -88, -3, - -23, 67, 18, 29, 54, 35, 23, 120, 45, 102, -38, -59, -120, -66, -24, 52, 58, 10, -97, - 121, 90, 26, 49, -62, 19, -14, -109, 80, 45, 59, 75, -70, 0, 80, 67, 98, -23, -111, - 98, -52, -118, 79, -26, -102, 57, 60, 11, 15, 94, -96, 111, -108, 93, -66, 2, -32, 79, - -84, 17, 65, -84, -78, 73, 68, 33, 6, 14, 10, 10, 3, 23, 1, 2, 2, 14, 1, - 2, 4, 4, 1, 2, 5, 22, 0, 1, 13, 1, 2, 3, 6, 1, 3, 4, 5, 3, - 3, 3, 3, 3, 2, 2, 3, 3, 1, 1, 9, 1, 0, 1, 20, 6, 2, 3, 8, - 0, 2, 0, 6, 7, 0, 2, 2, 16, 6, 1, 18, 8, 9, 1, 0, 0, 0, 5, - 4, 0, 5, 6, 5, 1, 11, 1, 11, 13, 10, 23, 0, 6, 2, 3, 5, 1, 1, - 14, 0, 17, 1, 9, 13, 5, 8, 11, 1, 2, 3, 1, 2, 1, 1, 2, 2, 13, - 37, 1, 11, 14, 14, 5, 3, 2, 5, 19, 19, 15, 1, 1, 0, 2, 4, 1, 2, - 5, 6, 2, 1, 3, 9, 17, 0, 18, 13, 17, 3, 1, 0, 2, 2, 5, 1, 5, - 8, 2, 0, 3, 1, 2, 4, 7, 11, 19, 6, 8, 5, 1, 14, 26, 2, 4, 0, - 17, 9, 3, 6, 7, 7, 1, 5, 3, 4, 0, 6, 15, 17, 4, 12, 16, 0, 22, - 17, 8, 28, 10, 3, 3, 4, 1, 5, 4, 3, 17, 15, 1, 4, 6, 2, 7, 1, - 9, 8, 4, 2, 8, 2, 3, 6, 1, 2, 3, 8, 7, 7, 7, 3, 0, 13, 3, - 0, 3, 0, 3, 8, 3, 20, 6, 3, 4, 12, 3, 18, 1, 6, 7, 2, 2, 3, - 34, 0, 3, 18, 1, 10, 16, 3, 8, 8, 1, 1, 3, 1, 7, 1, 8, 4, 2, - 6, 8, 27, 11, 24, 6, 4, 7, 11, 2, 1, 0, 1, 3, 10, 8, 0, 3, 13, - 6, 12, 2, 2, 3, 1, 6, 7, 3, 9, 7, 2, 18, 1, 1, 2, 19, 7, 3, - 11, 7, 10, 5, 2, 0, 1, 2, 3, 9, 4, 30, 8, 3, 5, 12, 7, 0, 0, - 4, 2, 17, 2, 5, 7, 0, 8, 1, 14, 3, 0, 3, 22, 25, 3, 9, 2, 12, - 6, 2, 3, 10, 3, 5, 11, 1, 1, 5, 3, 16, 23, 25, 22, 1, 2, 0, 0, - 14, 8, 15, 1, 7, 2, 1, 22, 2, 8, 1, 5, 1, 5, 1, 2, 15, 8, 0, - 2, 14, 0, 6, 8, 4, 0, 37, 27, 0, 6, 8, 0, 39, 13, 10, 3, 2, 1, - 14, 8, 10, 1, 9, 23, 12, 26, 10, 4, 3, 26, 3, 1, 4, 1, 1, 7, 3, - 6, 6, 8, 6, 40, 1, 10, 5, 1, 1, 15, 1, 11, 3, 22, 6, 8, 3, 23, - 3, 1, 0, 13, 3, 3, 2, 22, 8, 0, 0, 9, 3, 25, 10, 6, 10, 3, 4, - 2, 5, 3, 0, 8, 12, 21, 2, 11, 7, 2, 22, 2, 2, 2, 5, 11, 1, 26, - 2, 0, 1, 36, 13, 6, 2, 4, 1, 0, 5, 9, 6, 4, 1, 28, 0, 3, 3, - 3, 2, 19, 6, 27, 2, 23, 5, 7, 2, 0, 12, 1, 1, 0, 1, 3, 27, 1, - 8, 0, 8, 1, 3, 42, -}; - -const static __attribute__((aligned(16))) int16_t input0_element_s16[] = { // 16 x 16 x 64 - -3986, -18351, 31892, 893, -30236, 18675, 24676, 17687, 12412, -32350, 9000, 17124, -1599, 23533, - -8540, 22435, -9632, 24383, -28076, 12995, -15037, -5890, 30224, -32313, 18280, -2694, -27880, -26650, - 18786, -30174, 25730, 10113, 15323, 26590, -6744, -1469, -1685, 17812, -15100, 29314, 26050, 21724, - 10862, 5503, 12920, -16828, 13134, -407, -10528, 4167, 15445, -22316, -5348, -21902, 29504, -13732, - -29850, -17062, -5513, -7514, 10753, -3265, 28628, 9587, -30626, -18002, 18264, -27695, -4995, -32525, - 17005, -18200, -7195, 16683, -25816, 3593, 25011, 20571, -29924, 18767, 24269, -11371, -32552, -28467, - -28396, -17428, -23147, -29951, 22724, 8178, 5905, 16346, 8351, -6530, -13105, 17344, 17591, 8126, - 19338, -16037, -4477, -431, -9715, -6863, -26401, 24852, -25257, 30575, -26878, 26006, 2732, -22901, - 4883, 14045, -14601, 9012, -13122, 32429, -19651, -24074, 30529, -16045, -19685, -17093, -20109, 32395, - 4598, 4366, 12510, 3553, -18691, -15581, -10974, 11549, 3796, -13230, -15870, 21630, 13702, -32034, - 440, 7952, 25099, 9623, 32541, 29722, 9903, -17075, -4975, 28359, -12567, 28885, 26817, 32532, - -19619, 2348, -20683, 22664, 23666, 23591, -11719, 894, 27959, 19039, 16952, 3966, 5516, -28859, - 3869, -28714, -1898, -26877, -10476, 305, 24146, -496, -10900, -21875, -10570, -20961, -6457, -15816, - -11284, -20840, -5722, 27397, -17841, -31299, -15222, -17383, -11847, -4675, -13718, -30112, 13540, 5042, - 5289, 9039, 11021, 26511, 19055, 30482, 21074, 4860, -20541, -8443, 18781, 10881, -29560, -8890, - 15308, 18204, -6704, 15311, 2729, 18696, 18509, 18159, -24346, -1400, -3176, -23113, 13809, 5538, - -17995, 203, -17666, 15943, -14235, 6243, 10685, 10592, 27215, -5445, -4347, 5722, 29849, 5515, - 23012, 29544, -24808, 6527, -5082, 13599, -26193, -15646, -17732, 4034, 8071, -14033, -17980, 11272, - 6046, -11074, 21264, 9195, -5888, 15746, -3677, 9558, -1251, -9971, 9232, 18831, -10293, 28235, - -15532, -20772, -10210, 7326, 22309, -30340, 14958, 21746, 23307, 12655, -3459, -7254, -23595, -11155, - -26213, -24741, -13229, -23295, 13467, 15887, -4780, -23020, 2082, 16639, -20004, -10826, -11763, 31856, - -8735, 29521, -6585, 19405, 25826, -30544, -24000, 10639, -25205, -22856, -19173, 7411, -4905, -13945, - -16239, -16730, -27201, -22283, 1414, 20447, -23923, 5473, 4746, -12319, -17580, 19043, -16216, 29934, - 25427, 15156, -5368, 1841, -1994, 15579, 22817, 13753, -24052, -28677, 5359, 4072, -10466, -26124, - 29730, 13853, 30000, -15151, 29768, 2980, -13869, -15445, -21581, -6662, -17297, -4279, 4269, 14310, - 29836, 23387, 9963, 15563, 7670, 13255, 19861, -1218, 9901, -10565, -8629, -26823, 3316, -14238, - -5801, 6380, 24198, -5891, 13475, 8983, 17860, -11688, -1386, -19998, -9716, 21060, 1617, -13009, - 28318, -223, 30978, -6325, -10242, -23407, 12581, 28945, 16098, -32578, 18102, -7058, 6220, 23885, - 27556, -18416, 16640, -11910, 5160, 24282, -13212, -1674, 13145, 11043, 6286, -11634, -9525, 8266, - 12243, 2819, 9879, -4218, -3025, 23393, 30430, -9713, 3752, -11530, -19371, -28827, 22066, 11830, - 5838, -32166, 10752, 13896, -18150, 11142, -28103, -13629, -681, 4767, -26362, -29294, 32617, 12060, - -9700, -9614, 14494, 3847, -27355, 2406, -28276, 20350, -7900, 6439, 7967, 6605, 8534, -17071, - -8069, -9799, -4491, -10954, 24705, -3413, -16045, -5043, -20333, -14399, -19172, 23168, 28771, -30118, - 5728, -22951, 15909, 28100, 24300, -17567, 6101, -12322, -15301, -14465, -15228, -11832, 24596, 2928, - -21138, 23607, -12176, -24561, 21465, -30487, 26193, -80, 7479, 7768, 17893, -14949, 6895, 13938, - 21817, -6300, -19235, -6222, -5335, -10262, -4195, -23408, 7322, -23927, -6979, 32081, -6382, -31022, - 21566, -30752, 8067, -20161, 27432, 29297, 26694, 12625, 26736, -29680, -12489, 19382, -6118, -27563, - -33, 1248, -30164, 30570, 10443, 17081, 9293, -3791, 7912, 30038, -22987, 27484, 26609, 19043, - 14176, 12155, -3451, 31872, 23874, -31256, 22526, -4351, 27069, -29251, 19209, 13572, -30486, -19678, - 23747, 28022, -21143, -28006, 16115, 28748, 29398, -11672, -14678, -1684, 14456, 18026, -532, -14842, - 11737, 31551, 8032, -22313, 6803, -20844, -23422, -16235, 20714, 19866, 3171, 29672, -23704, -17462, - 18869, -18873, -32182, -24887, 25124, -22433, -26863, 3372, 15286, 31062, 7473, -7031, -21231, 28964, - 8372, 31682, 28007, -16969, 22321, -28215, 1568, 16350, 9186, -18868, 28937, 18008, 19539, 16884, - 28770, -4705, 18777, 5826, -12962, 5724, -12828, -30068, 10686, 15691, 22527, 13674, -32601, 27777, - 19369, 16013, -12767, -9087, 4666, -11796, 16448, -6067, 28734, -10564, 27544, 27041, -2304, -8205, - 24056, -22061, 13852, 5504, -24827, 14269, 20753, -26413, 18072, -32505, 8273, -30519, -2699, 16428, - -28545, -28749, -1704, -29922, -28157, 8474, -20233, 2005, -31524, 15274, 6557, -5715, 18696, -6020, - 4495, 31184, 8552, -26391, 9184, 3655, -27521, -4137, -2430, 6319, 27798, 20037, 28814, 10187, - -4894, -18903, -15745, -29090, -14460, 4736, -18586, 26864, -31395, -4056, -29715, 22944, 16928, -20082, - -32210, 16141, 27944, 23131, -2460, -4094, 14428, 12554, 10418, -15407, 19468, 12585, 15856, 5928, - -14068, 20697, 8344, 18256, -23136, 10649, -13543, 21883, -23991, 2813, -31595, 22154, 18013, -20596, - -29589, 28314, -4912, 12498, -9956, 26370, 8310, -26532, -2153, 23739, 30564, -1198, 13177, 4825, - 21859, 26609, -844, 23477, 22957, -12603, 16294, -8652, 14946, 632, -19463, 10968, -26974, 20277, - 12301, -3475, -23372, 27422, 32718, 18080, -6635, 20598, 27110, -31493, 13231, -27265, 26171, -30613, - -5351, -2010, 25046, 21055, -9177, 30114, -20279, 10280, -24787, -27029, -23624, -17886, 15168, 24061, - 19478, -19894, -18574, -2949, 30419, -23235, 18890, 23914, 32568, 22113, -21061, 24318, 9129, 17007, - -1718, -12812, -3952, 5205, -28203, -1252, 23724, -28555, -1338, 7442, -28592, -29278, -28567, -23547, - -23369, 3920, 23900, -12407, 1160, -10123, 22556, 14362, 1587, 16803, 18658, 3556, 18090, -14325, - -11876, 31794, -12207, -20040, -8586, 24448, -16024, 18388, 15801, 22062, 17140, -17830, -19513, 26364, - 25251, 26804, -10336, -4212, 16398, -26380, 28770, -5038, -17259, 6230, 7052, -2330, 32687, 29728, - -12974, -3658, 28646, -11336, -30167, -4847, -9061, 12738, 9222, 24596, -26077, -6610, 1534, -15761, - -3159, 6414, 10715, 6702, 22791, 11972, 78, -9419, 19393, -3027, -14702, -17179, 32434, -22209, - 21888, 4616, -9172, -23809, 2830, 2638, 6273, 6326, 4590, 28383, -19813, 22209, 23785, 12665, - 28338, -18652, 5385, 877, 8013, -9270, 17320, -9457, 24628, -13404, -27966, -21901, -2551, 10278, - 14400, -9576, -2638, -7236, 25422, -25896, 15462, -10173, -32202, 26329, 762, -18481, 28754, 31043, - 25834, -27121, -22156, 20544, -16952, -7526, 17340, 12535, 14403, 12350, 13808, 11333, 27431, -9954, - -26588, -3493, -10643, -23033, -23186, -9825, 14139, -11847, -13658, -17547, -24413, 16072, -27193, -23317, - -19089, -26572, 29005, -2270, 15063, -7734, 20915, -14198, -23693, 30055, -25653, 9620, -16574, -28678, - 6951, 8399, 3383, -29630, 17816, 4351, 32621, -8710, -23022, -25182, 14749, 9413, 6349, -13716, - -434, -1724, 6466, 15920, -10459, -22997, 3555, -31373, -32723, 16995, 24746, -6477, -24667, -12859, - -21071, -7233, 28692, -4835, 27208, 23865, 17233, 8751, 20021, 1764, -7838, 29480, 22009, 24235, - 5711, -3435, -16975, -2632, -8206, -14568, -26671, -21401, 18399, 27589, -14968, 32236, -31001, 26327, - 3344, -23018, -24206, -9255, -29205, 1831, -30383, 14187, -17787, -21801, 727, 4051, 26817, -31643, - 25822, -7174, -4863, -17570, 15222, -17376, 19992, -23903, 12204, -30124, -13604, -28670, 9165, 18384, - 3189, 17002, 26689, 24505, -27995, 17734, 2239, -23489, 24162, 25465, -12631, 21779, -29246, 21258, - -11541, -27080, 16340, -22016, -7979, 21585, -1775, 2368, -8712, -5787, 16808, 7687, 13145, 430, - -31082, 18539, 11358, -6550, 3958, -32122, -18461, -23170, 27791, 3911, -21957, 22581, -30874, 26791, - 25106, 363, 815, -10958, -1555, -32127, -24746, -30268, -8377, 20423, 1568, -24667, 26243, -1826, - 32024, 19259, -27694, -15813, 19301, -8308, 12307, -19786, -12426, 13830, -17292, 12872, 13573, -21838, - -2400, -29330, 27262, -23176, -13701, 4728, 12120, 7588, 18762, 18680, 30110, -1674, -5260, 12010, - -5257, -24646, -7641, -5242, -30139, 14908, -30780, -20198, -11680, 14994, -2464, -31341, -3170, -27670, - 2614, 24262, -24130, 5771, -21986, 16239, -5562, 23490, -13003, -26143, -28440, -16320, 26268, 9062, - 3609, -24665, 990, 23358, 23928, 25043, 9660, 23005, -1286, 16483, 1953, 22055, -8720, 4718, - 3460, 10727, 3989, 28916, -16053, -26181, -10152, 14085, -11675, -20227, -16414, -2278, 32751, -16691, - -23221, -16133, -22322, 8704, -5670, -4173, -19943, 10397, 10757, 25571, -25656, -27576, 686, 25401, - -11027, -4113, -16820, 9180, -6141, 15397, -2223, -25821, 23148, -7872, -6218, 22497, 16505, -682, - -7102, -14164, 8892, -7008, -28947, -22249, 14272, -21651, 568, -13298, 10552, 26322, 29300, 32115, - -24058, -17368, 28454, 6996, 9340, -3111, 7733, -13746, 19500, -13661, -9228, -19470, 4387, 28657, - 26691, 29131, -13916, 18357, -25568, -26892, -22153, -11882, 27754, 24094, 1244, -8066, -21712, -2809, - 26769, -1554, 4713, 15720, -17987, 24961, -19353, -1939, -2603, 32196, 31993, 19599, -6642, -31434, - 16310, -23182, 5278, -29865, 18471, -13208, -8383, 15726, -23907, 6336, 22124, -30515, 12747, -27826, - -17148, 24860, -9917, 12035, 27884, 26486, -23307, -7112, 24851, 12792, -2495, 10923, -17141, -32695, - 2565, 16320, 2554, 24161, 32248, -9325, 21036, 31560, 617, -7507, 25017, -30458, 18785, -18281, - 24747, -21859, 14682, 2337, -31106, -16724, -8919, 18274, 4236, 24155, 437, 17068, -18989, 7250, - 23339, 15545, -10219, 9223, -17771, 28621, -21690, -26055, -12682, -32199, -25895, 31130, 14941, 24562, - 27910, 15187, 3641, 11617, -16905, -21355, 26522, 21886, 15151, -31662, -20639, -28594, 3713, -8574, - -14008, 19004, 14619, 21007, 30399, 10318, 30924, -19831, 16686, 3661, -30905, -28339, -12815, 1884, - 27278, 5112, -2019, 26518, -14035, 31251, -30438, -32686, 10502, -31704, -15659, -8126, 18689, 31880, - 7323, -11464, 28559, -9330, -10214, 32738, 30029, 32590, -28340, -5868, -10186, 26250, -29776, -25041, - -10523, 8502, 21697, -3183, -32333, -11959, -16728, 10506, 14318, 275, 5864, -31947, 28635, -20976, - 29943, 13707, 29904, -8673, 28551, 21784, 23859, -20925, -30938, -20036, -6562, -15657, -20932, -28348, - -23413, 18090, -8475, 3341, 29472, -6506, 4350, 21327, -11024, -896, 30372, -25082, -3822, 4948, - 25547, -21676, -32072, 32436, -12971, 24378, -21537, 14866, 6199, 16593, -14246, 25515, 15628, -14208, - 8281, 1676, -27991, 15016, 6623, -3086, -2153, 15488, 9614, 9161, 4379, 10426, -31212, -7305, - -31165, 17425, -346, 31066, 14580, 21957, 22548, 3073, 9457, 6411, -29463, -31391, -14673, -4956, - -280, 25492, -7606, 17417, -7221, -4145, -13507, 13158, -27791, -2253, 15510, -8203, 10693, 23216, - -6977, -24658, -10469, -23638, -30981, -29428, 16142, 5794, 24581, -20245, -27401, -16240, -22826, 15975, - -1085, -31992, -27489, -2031, -20115, 7140, 25861, 10066, -28163, -32373, -32013, 30053, 27333, -27079, - 863, -27326, -10294, 14709, -13421, -9829, 25585, 11264, 19246, 26411, 353, 24190, -27654, 17017, - -24544, -19559, 12053, 10414, 23859, -16126, 9264, 27921, -5566, -2711, 5244, 30758, 2927, -1436, - 15844, 26073, -2200, 7621, -5971, 1604, 20907, -2845, -9282, -21362, -8773, 24796, 30496, 23966, - -5427, -5819, -2906, -4045, -10637, -5111, 10301, 16445, -10642, 28934, 9812, -21416, 672, 15929, - -27238, 18313, 9119, 21652, -21249, 28197, 3832, 8825, -1370, 18208, -30599, -18455, 7876, 23297, - 5207, -14865, -754, 26786, 3718, -27376, 26290, 22576, -1821, -16107, -23212, 1383, 12821, 20243, - 18533, -20404, 2520, -25838, 18283, 5084, 21125, 8398, -19649, 16656, -15853, 28771, 2323, 2932, - -2100, 22089, 22261, -19678, 8218, -9861, 24910, -21725, 21194, -6132, 222, -1879, 17753, 3575, - -22784, 5210, 2631, 11928, -6862, 27651, 11239, 14595, -1195, -19733, -3712, -18043, 131, -24028, - -779, 31661, -25283, 17474, 8855, 13871, -4671, -24444, -16944, -12317, -31960, 32699, 14787, 10414, - 11679, 15896, 4686, 3553, 26150, 22302, 23332, -32115, 6035, -10198, -8291, 4192, 6597, -190, - 20726, 26417, 11517, -16495, -28822, 25426, 22802, -19566, -25282, -29946, -6652, 10250, 25089, 3319, - 7926, -27255, 31166, -12205, -6829, -29544, 10110, 12065, -23218, -40, -13955, -16036, -11576, -289, - 32398, 1420, -18649, 21416, 15596, -12178, 22484, 21150, 7580, 18579, 27093, 2117, -15307, -9159, - 19355, -2812, -13535, -31742, 30197, 18958, 10951, -13325, -13098, 22970, -8622, -15531, -19952, -22061, - 25240, -27451, -14841, -25202, 25706, 6165, -31992, -23053, -11897, 7464, 17684, -18593, -16023, 2847, - 32613, -32146, -18131, 2326, -20535, -9338, -25000, 32713, 23329, -13773, 7386, -6564, 30977, 13003, - 31118, 5661, 14947, -10565, 23320, -21474, -22076, -17030, 1936, -939, 8239, 29272, 21654, 603, - -1877, 26498, -26059, -21325, -15281, -5531, 25655, 4520, 18278, 9785, -5839, -5068, -14142, 2669, - -22301, 12894, 22193, -24107, 2262, 18409, 16938, -115, 25205, 32379, -7000, -4374, 30825, -10303, - -26013, -4610, -9757, -5796, -27512, 27553, -9409, -18984, 17641, 18558, 4503, 19717, 5354, -10203, - 27955, 15249, -32489, 4217, -13108, 4059, 24164, -20115, 28531, -5634, 24801, -11318, -23979, 16615, - 27848, 19840, -29716, 15874, -5425, 18051, -4069, -19937, 16058, -2594, -16568, -19847, -26764, -7032, - 1842, -30366, -15941, 16585, -16694, -18431, -27817, 18648, -22610, -2836, 2903, 15619, 27105, -9603, - 11744, 9316, -5006, -5119, -22596, 10303, 12948, 27546, -21220, 31569, -23753, -29622, 31824, -17508, - 3657, -8332, 16764, -20457, 26875, -5366, 31825, -19466, 2216, -4336, 15504, 18336, -23132, 12783, - -12734, -30864, 30960, 26666, -10447, 457, -9576, -13044, 7478, 2448, 27347, -23939, -24126, 7663, - -28014, 19433, -14406, 17507, 31863, -11601, 30239, 15078, -18496, -5625, -25849, -22472, -9255, -30691, - -6526, 1814, -15208, 30749, 26573, 29846, -3931, -17736, -7000, -4493, 1580, 839, -4525, -13829, - 10969, 741, -4887, 28481, -14078, -5890, -29347, 8059, 20016, 5103, -30964, 13905, -5800, -32581, - -27863, 15369, 32694, 4358, -10762, 17019, 14287, -14548, 30645, 17233, 14325, 3586, 3011, -1980, - 24342, 14794, -24201, -6751, 384, 23884, -6111, 5215, -16807, -13367, -14607, -11990, -9748, -11276, - -7348, -15188, 2933, 32347, -20760, 18869, -8521, -8716, -21305, 8403, -26562, 9816, 20291, -31745, - 4427, 31497, 32369, -21736, 21023, 25761, -20693, 27206, -29427, -12268, -1402, -18478, -18343, 23363, - -15671, 29810, 23559, 1685, 30799, -27876, -31864, 10900, 16229, 25743, -10089, 6894, 28128, -7035, - 14413, -2313, -12420, -11153, -28294, 15065, 16746, -31947, 11900, 7240, -4083, 23539, -28245, -30421, - -11979, 17386, -30130, -26343, 13813, 6534, -21234, 11505, -1047, -14462, 6887, 13524, -5748, 27087, - 20918, 3692, 7328, 29990, -28552, 9767, -32456, -25877, 25294, 27101, 22987, -11392, -13758, -10109, - -22642, -15530, -9281, -27230, -18184, -1639, -14853, -31525, -11159, 3414, 1726, -18209, -27889, 22047, - 1065, -31514, -20299, 29888, -31214, 22604, -7219, -30444, -12882, 6129, -5561, -7485, 14430, -3713, - 10752, 30843, -24918, -16471, -23334, -8727, 16420, -15758, 29036, 12529, 23644, -10331, -20840, 10514, - -18161, 25106, -3173, -18933, 31086, -31023, -10854, -29985, 2100, -31899, -27585, 14706, 28857, -2785, - -6313, -23604, -6466, -14967, -29226, -9920, 1038, 15866, -31069, -3307, 32319, -22387, -28244, 6737, - 18493, -16957, 15445, -15181, 18513, -16061, 22045, -12061, -24348, 12480, 11000, -4511, -7076, -28185, - 22968, -31733, -18585, -629, -9404, -31694, -32550, -30379, -9414, 4937, 17680, -29824, -14248, -18465, - 16197, -11967, 11995, -22362, -28950, -2002, 12280, 5085, 12787, 10260, 19461, 10679, -24012, -20307, - 26234, 23390, -8728, -1389, 16857, 18110, -10416, 3640, -24493, 4751, 293, -17880, 18336, 4369, - -19062, -21556, 14249, -14409, 27496, -4075, 6914, 30371, -16416, -16248, -9743, 18045, 22902, 10433, - -3175, -2400, 6510, 22986, 100, -24342, -7563, -6406, 30047, 24088, -10369, 4037, 15101, -12588, - 9861, 13607, -12010, -27227, -25836, 12423, 5637, -6041, 21363, 23285, -9676, -19631, 22648, 6896, - -28137, 3646, -17222, 13911, -13140, 9309, -32610, 9612, 9919, -27835, 13782, -2332, -6661, 4156, - -7320, -30611, 1559, -28229, -29627, 9955, 9584, -4268, 7814, -30949, 9818, 4479, -23362, 21910, - 11870, 8857, -14282, 4792, -15017, -5723, 26988, -1468, -32204, -28748, -1957, -3330, -3417, -32352, - -10551, -28703, 4651, -10489, -18942, -23887, -25845, 24153, -15529, -11916, -28586, 4695, -25492, -13320, - 6867, -27079, -25513, 23681, 17131, 16044, 2786, 9934, -2900, -27557, -7028, 16363, 14038, 24666, - 31453, -27411, 24102, -26835, 6031, -4639, 24205, -5864, 32689, -17769, 7933, -15483, -9469, 4654, - 1654, 15751, 1047, -8763, 28114, -32712, -26474, 29153, 5333, -32062, 14996, -24684, -25206, -19195, - 10500, 4576, 3976, 32653, -21010, -13119, -5836, 19183, 14553, -15935, -14092, 14394, -7545, 25499, - 14125, -28337, -888, -24540, -14508, 20814, 32679, 8853, -11493, -25671, 28087, -21975, -19276, 11432, - 8926, 6213, 19903, 18221, 6009, 10467, -24451, -22124, 27533, 16475, 20181, -8625, 12587, -7764, - 17492, -12970, 1166, -2122, -9814, 3269, -24102, -28643, 18258, -19397, 20978, -10031, -11910, 21455, - 26797, 23269, 22936, -22696, -20407, -6535, -2072, 23728, -19897, -16232, -14207, -27236, 13535, 32537, - -18134, -8739, 9354, 954, -19180, -10680, -31598, 31323, 18098, 20578, 25546, 15670, -12489, -6460, - -312, -27756, -7284, 344, -25560, -4470, -25027, 14637, 27664, 12244, 28439, -18728, -22467, -4309, - -9500, -22769, 4012, -27232, 28997, -31975, 302, -30735, 32099, -23926, 5224, -689, 16446, -24993, - -15413, 6668, -16072, 22540, -12000, -7763, 27699, 21801, -17286, 28610, 13373, 17324, -12155, -26650, - -32756, 30881, -26420, -26170, 9114, -8192, 22708, -8230, -32203, 6773, -21187, 1997, 15774, -14163, - 11338, -28003, -20725, -21636, 9391, -13094, 16860, -11462, -23865, -29374, -6493, 19099, -24518, -21353, - 4663, 19831, -13951, -11189, -30376, -29770, -25136, 16385, -4060, 24352, 1604, 26474, 26434, -12859, - -18577, -31314, 4057, 22628, 10481, -26499, 25342, -11783, -16348, -25859, -19730, 4052, -29594, -19806, - -13775, 5676, -27173, 20537, 3663, 14697, 8562, 2245, -476, -18002, 28951, -23886, -9428, 22648, - 24313, 6548, 11603, -10444, 25951, -16049, 23686, 13769, 25726, -9793, 12930, 23284, 2240, 6598, - 22905, -27356, 20524, 10467, -26120, 14274, -3430, 17537, -23483, 12840, -7505, 15549, -24390, -19869, - 27120, -22379, -18505, 4890, 14618, -2160, 3002, -8396, -12233, 1944, 25830, 16238, -6519, 22320, - -26402, -7777, 6911, -25585, 30348, -17657, 29569, 6162, 15450, 4438, -27869, 4950, -809, -6544, - 9863, -12629, -26361, 16781, -6396, 12061, -1252, 7372, -32359, 3338, -7018, 2027, -1577, -17171, - 18618, -10398, 15691, -19264, -24191, -8166, 10571, 14387, -15913, 20664, 16139, -535, 10292, 28271, - 30495, 26969, -31586, 19306, -4912, -14720, -11140, -23510, -32190, -7702, -11258, 28177, 6531, 26364, - -19113, -27321, -26084, 28491, 24091, -11580, 21206, 8378, 8277, -26475, -24440, -4847, 30518, -23914, - -7898, -10539, 19666, -17562, 6545, 3689, -25859, -21663, -14815, -9796, 22995, 30658, -31486, -27970, - -32443, 31327, 11650, 927, 2562, -8194, -1669, 30503, 2864, 9852, 20693, 3839, 6605, 30652, - 1810, -9560, 1258, 21066, -22799, -12269, 30893, -23801, 31442, -18469, 15293, -27564, -3221, 27282, - 1649, -1696, -16821, 31219, 384, -32602, 27565, 1919, -16483, 10225, -22821, -13313, 24368, 31894, - -1616, -5350, -14680, 3268, -3689, 11149, -30097, 19151, -2788, 31867, 18319, -27171, 31828, 1228, - 21960, 1470, -23490, 27714, -5869, 5970, -17897, -28528, 6002, -31681, 1017, -31333, -8429, 30939, - -17644, 13852, 22114, 9645, 22631, -15205, -3548, 8086, -20880, 3792, -30408, 19702, 23488, 13214, - 7969, 7161, 19510, -10324, 1175, -18777, -16968, 26718, 15160, 329, -8441, 5335, 31819, -21646, - 1519, 28345, 420, 24788, -19795, 5961, 25767, -27171, 23324, 25961, -11143, 32471, -8587, -4675, - -25197, 15999, -8768, -19115, 18558, -18862, -26890, -4897, -11905, -18371, 8219, 20747, 5449, -24421, - -15606, 3148, -5170, 10484, 30774, -10561, -24056, 9317, 22769, 20483, -4184, -4109, 520, 24223, - 17762, -9808, 26439, 20185, -5175, 14806, 5608, -22834, 30561, 4730, 4703, -10958, -22671, -11414, - -30020, -15511, 10446, 4060, 13770, -30663, 3568, 27087, 9585, 10113, -8331, -25416, -3535, -30073, - -16913, -26680, -8853, 2459, 22052, 17529, 14213, -19378, 22692, -13387, 29785, 248, 9633, 9895, - 8824, -18932, -16388, 32637, 14703, -8576, -7686, 26858, 19033, -23816, 26948, 9729, -14037, -3573, - 18520, 24399, -1754, 24648, 3733, -2835, 4257, 26453, -23330, -24839, 3470, 19425, -10257, -11521, - -30720, -17248, -10133, -24487, 16624, -710, -3625, 32396, 9296, 27977, 25976, 22629, 14445, 22326, - 6084, -16319, 24810, -3102, 4994, -31132, 22054, -286, -20247, 4907, 1373, -2, 10677, 13803, - 17899, -2550, -19338, 8618, 16290, 22588, 11482, -21983, 19146, 19075, 16997, 28204, 32075, -6283, - 19099, 7515, 29147, 15456, -16421, 25841, -24215, -8564, -24488, 6861, -9169, 388, 15036, -8290, - -4424, -2692, -836, 27950, -19894, 562, -149, -9281, -28038, -19450, 20166, 7186, 3708, -18248, - 24108, -25501, -22675, -21296, 15614, -7939, 6621, 4511, 12660, -18108, -22245, -21187, 11109, -5341, - 6360, 16179, 21448, 24496, 9856, 23569, -21602, -6305, -9336, -26494, -24113, -12163, 30787, 1397, - 14642, 21625, 29968, 10838, 4687, -26457, -11802, -4460, 261, -400, -30668, -30775, 30066, 16022, - 29741, -905, -14801, -31723, -16546, 25822, -28288, -9388, 11366, 29060, -12523, 13953, 18734, 25475, - 19090, 5050, 4833, 21727, -8394, 18951, 7909, -19115, -4763, -32635, 6313, 11793, -22974, 14896, - -26237, 24435, 23572, 21713, 17345, 20786, 18258, -27536, -317, -10281, -11057, -17107, -15309, 20737, - 15699, -21354, -8586, -27057, 7602, -4212, 28984, 20356, -6982, 8593, 32316, -24004, 31664, -15467, - -20995, -22329, 21777, -1445, 20700, -7152, 31351, -20710, 29783, 25137, 31869, -29681, -1009, 23714, - -29790, 1219, -5477, -21880, 3435, -16084, -14104, 6876, 4678, 20502, -24176, 20718, -25773, -13652, - 28532, -1706, -30906, -23355, -28718, 6954, -12432, 31955, -21067, -32004, -32614, -30161, -9105, 26130, - 743, 8793, -3633, -4422, 11614, 1044, -5686, 11548, -26289, 7177, -13987, 24294, 28343, -1218, - 29153, -29986, 44, -440, 32313, -17641, 18884, 25865, -1658, 5459, 14622, 9168, 27770, -26871, - 9487, 3170, -23447, -31729, 18692, -14211, 10881, 13798, 24, -3458, -17841, 7287, -20877, 24083, - 17720, -11246, -4677, -18969, 20353, -9962, -6621, 26332, 9137, 1361, -27594, 22789, -31743, -30974, - -14385, 17287, 31511, -4900, 15379, -20479, 7911, 918, -14279, -13598, 18315, -11115, -2728, 8320, - 26488, -12598, -227, -23427, -25137, -10478, -32434, 12354, -92, 9627, 20031, 32437, 20190, -14076, - 3634, 22812, 13228, -11099, 5218, -11018, -9587, 1075, -25008, 3592, 32483, -4623, -13734, -24066, - -2328, 21996, -4167, -30364, 30733, -7467, 22351, 4936, 12025, 19318, 23652, -4958, 28692, 17742, - -18507, 27547, 10422, -9514, -6585, -20878, 27142, 24603, -2981, -4838, 6495, 23063, -16550, -17051, - -26678, 11493, -3383, -29290, -19975, -11383, -15583, 4597, 20743, 32570, 3957, -28542, 30736, 23905, - 7463, -6083, 21231, 22452, 19745, 20094, 19750, -18085, -11454, 17568, -11519, -8398, -29064, 29092, - 890, 28710, 21010, 31233, 15519, 31274, 16872, 31769, -23646, 24394, -25948, -16278, 7456, -19560, - 26098, -1723, 1363, 6141, 23789, 3701, -23741, 1690, -21403, -22615, 11821, -7392, -3993, -5763, - 6230, -28411, 4271, 8250, -14031, 20019, 20285, 25602, 13752, -6741, 28071, -26886, 20933, -1602, - -13671, 32156, 11774, -7919, 22854, 13089, -15940, -12060, -25476, 23112, 4448, 13780, -8128, 28689, - 7662, 23383, 5364, 25114, -27320, -8461, 32091, 1223, -12303, -29172, -30179, 21097, -31189, -14065, - -23589, -12277, 2638, 23847, -17214, 10905, -15795, 7597, 6358, -5566, 15566, 7808, 2303, 18470, - 22790, 21567, 10523, 12962, -5115, -2947, -8928, 14714, 32065, -11123, -27780, 30377, -29579, 19453, - -32749, 25873, -3211, -24201, 22742, 1500, 24016, -28735, -24308, -17395, 14386, 22286, -17919, 25267, - -29545, -19174, -16958, 18952, -24217, -26007, -30573, 12979, 12342, 4096, -21005, -10485, -10429, -17913, - 23284, -18970, -3725, -30155, 31241, -32172, 14010, -32141, -24099, 6988, -27093, -13672, -28867, 14721, - 23117, -26189, 4624, -13994, 2533, 17096, -17823, -4476, -4684, 12349, -28351, 3107, -27824, -20536, - -24747, 20453, -7560, 24077, 5722, -20999, -31115, 4330, -3124, 24960, -20351, 2384, -14733, -21035, - 19332, -30529, -19689, 1410, -30176, 27315, 4829, 9182, 23907, -21300, -27896, -19959, 7527, -29253, - -5428, -4678, -31320, -6733, -5240, -16474, -10010, -4251, -11175, -29016, 15582, 5838, 23314, -16395, - 23154, -17357, 30779, 28503, 31487, 2450, -19201, 9496, 23221, 29338, -13571, -31827, 20102, -162, - -12089, 21715, -13245, 12622, -2476, 14899, 12252, 31829, -307, 24959, -13480, -8151, -19140, 10862, - -26313, 29490, 24047, -32456, -14899, -31721, -23650, -18695, 9351, 10187, 24400, -11019, -29487, 20743, - 15963, -21048, 32393, 13741, -25529, 29816, -26787, -17452, -17357, -28925, 9089, -1623, 7994, 23105, - 21691, -1772, -3414, 6831, -7330, 5443, 27717, -25283, 1759, -7536, -25154, 10737, -11023, 19564, - 12010, -20448, -22015, -25540, 14842, -3154, 19460, 3175, 20955, 19385, -11112, 11748, -14166, -21391, - -25856, -24290, -3496, 14532, 30467, 29363, -5397, -29519, 823, 24859, -13095, -30259, 6018, 13127, - 28981, -1643, 17205, -934, -2955, 15814, -3804, -156, -30815, -25210, 2126, 18160, 230, 4791, - 4997, 12175, 24479, 8800, 30963, 32657, 5372, 1964, -15814, -30562, 6720, 16027, 4500, 1732, - -9537, 3804, -23929, 3713, 7949, 28937, -5008, -18164, -19559, 7068, -11380, 12922, -18674, 826, - 30187, -28537, 31397, 28136, -26884, -12923, 16604, -6822, -28788, 18969, -19096, -30351, -8856, -24593, - 10895, 14521, -20692, -6370, 6477, -11920, -25580, 24869, -17760, 2116, -31348, 15445, 30138, -18992, - -32310, 30274, -26173, 30175, 29359, 32082, -3323, 10969, 20190, 10434, -29287, -84, -25015, -9560, - 17927, 15339, 11899, 16214, -6782, 11255, -24297, 23452, 20306, -17060, -29673, 4612, -15904, -7628, - -86, -32703, 28909, 14996, -13944, -20357, -5066, 30401, 25395, 17144, -23784, 28344, -12395, -18885, - -22351, 9755, 16334, 10258, -24244, -3312, 10523, -2696, -12327, 20648, -13558, 247, -30074, -1032, - 21540, -25965, -32080, -25737, 29804, 16535, -22522, -8348, 2228, -24879, 25794, 12553, 2748, 11389, - 7999, -26094, 21375, -29568, -25733, 11890, -765, -32352, 11874, -12512, -9889, 901, 1740, 1533, - 29383, -29310, 9290, -30340, 15128, -12164, -14632, -1890, 18023, -29765, 12849, -19961, 17872, 3204, - 21541, 23027, -28082, 19550, 17507, 30965, 28855, 12186, -28109, 31766, 20416, 1799, -23521, 11819, - 6631, 24778, 22067, -5226, 25250, -9607, -21857, 3547, -17065, -22148, 16622, -15813, 7574, 19258, - 12541, -8880, -16460, -5292, 10334, 13117, 24523, 29202, 3093, -27719, -397, -19005, 24620, 2209, - 3414, 3877, 20494, 23394, -17560, 5278, 14924, -17950, 25756, 14346, -24110, -6788, 12600, -6628, - 5027, -27844, 11840, 15935, -5617, -30645, -13212, -29294, 11797, -18522, 11828, -4006, -8397, 19289, - 10259, -27711, -20950, -1536, 28269, -25644, -11504, 10925, 4454, -1234, -25441, 3308, -13541, 13422, - 5127, 30808, -7581, -14076, 27839, -4741, 17766, -11253, 4045, 21095, -30521, 9385, 5771, 2338, - 15443, -31372, -880, -18203, -32058, 31999, -24991, -2211, 12370, -22605, 7955, 2259, -24223, 21924, - -29311, -30013, 9799, -27593, -28184, 27391, 30392, 29516, -17214, -30310, -30191, 3415, 17939, -27968, - -31405, -23804, 28874, -26985, -16540, 30764, -19835, 13580, 8411, -2937, -29201, 13743, 2039, 24827, - 24621, -26755, -30515, -8885, -4665, -14458, 9810, 7207, 415, -2882, -12076, 19899, 31675, -1320, - 11358, 3878, -13963, 9788, -15316, -27514, -17331, 15569, 24650, 4909, 1122, 9069, 12963, -14679, - 1590, -23366, 21129, -25027, 4686, -25042, -10783, 20977, -2130, 4531, 7713, 22879, -28714, 10086, - -6331, 27410, -24786, 13120, 22897, 6910, 6960, 8726, -29310, 16208, 17181, 28964, -9977, -28011, - 26802, -6116, -15227, -29521, 20443, 22395, 31961, -20401, 30138, -3918, -18217, 26093, -31201, 7880, - 2152, -9403, 16205, 6451, -2524, -29166, 13917, 4534, -30747, -23713, 24007, 7005, 5048, -8430, - 20661, 19238, 17782, -24145, -25455, -1641, -6515, -28309, -24604, 4786, 26477, -5685, -5965, -27941, - 23263, 7100, -21660, -12568, -22784, 222, 11375, -1277, -26644, 32719, -22470, -18139, -31632, -18167, - 1142, -4412, 8223, -21685, 28154, -12999, -8516, -9664, -23260, 12291, -31541, 3344, -24682, -23121, - -12000, -23888, -26592, 18276, -2954, 32622, 10069, 6439, 28473, -16733, 2729, 12240, 22276, 9984, - -27281, 17505, -30533, 22305, -16022, 18095, -24497, -6619, 824, 30440, 21236, 21173, 24157, 10481, - -22682, -30462, 11672, -7091, 20541, 25744, 20043, -30233, -21638, -32633, -16720, -31379, -347, 31257, - -28061, -3507, 15558, 19377, 28579, -589, 23412, 32114, 19644, -18453, -1388, -18100, -4012, -15183, - -14828, 2509, -7996, 19796, -2477, -19734, -2048, 7257, 4257, 985, -28137, -19281, 8955, 19603, - -10291, 28164, 17193, 22780, 12693, 26913, -15279, 20223, 1070, -7984, 5363, -11139, 14431, 13087, - -19273, -6524, 22791, 16254, 9897, 19312, -17351, -13154, 17483, 2651, 29035, -4497, 13918, -11883, - -9844, -17927, -18781, 4657, 2461, -10714, 3042, -13861, -7812, 9402, 7436, 32689, 14880, 25900, - -14895, 12040, 15877, -22829, -16651, 7360, -30975, -30833, 4506, 230, 30163, 1893, 26496, 21473, - 22532, 32053, -425, 18038, -30657, 27064, 24236, -24126, 23329, -21168, -3037, -17259, 31241, 8821, - -20619, -23687, -27352, -2657, -2310, 9875, -32361, 1937, 17902, -21340, -10943, 6352, -29775, -19580, - 16207, 28090, 7636, 27575, 12142, 11515, 5193, 12045, -17595, 29185, 1373, -13280, -12899, 27383, - -17767, -3044, 32625, 10640, 13166, -15840, 8766, -32764, -31557, 31320, 25162, 28064, -28285, -16073, - 16287, 17814, -29395, 3531, 15749, 8562, -14158, 13857, 1412, 17949, -8538, -8339, -28137, -23324, - -27908, 19975, -17153, 7130, 4459, -14162, -31896, 18944, 4779, 1118, 6336, -4295, -18281, 9857, - 16417, -25064, -2523, -28391, -1327, -13060, -21288, 6167, -27223, -5734, -31899, 27893, -24374, -5218, - 3147, -27352, -15624, -10517, 4561, -4417, 9095, 13981, -12228, 741, 18784, 5121, 9171, 15323, - 29075, 5818, -26590, -28065, 27659, 5074, 27559, 25666, 2170, -30497, -14353, -27288, 7871, -8667, - -13172, -12141, -17360, 29838, 15320, 6459, 17587, -13080, 22162, 20188, -20727, -32726, 22679, -7629, - 12176, -6947, 9850, 4567, -24078, -26404, -4208, 11391, -21714, -19329, 30390, 2823, 9126, 1647, - -22855, -13643, 9978, -10538, -20833, 27185, -836, -22655, -21324, 16243, -7223, 6554, -17855, -15493, - 17647, -17326, -6896, -8204, -11379, -21877, 20295, 26297, -20823, -11336, -22335, -21679, -16871, -5578, - -24201, -3692, -11216, 24419, -6561, -20389, -28319, -4889, 17173, 1653, 12391, -12423, -27450, 7761, - -3160, -24448, -11859, 2394, -2013, -30392, 10714, -10025, -13800, -17167, 17270, 5540, -14434, 24239, - -912, 27539, -6329, -21934, -10854, -4358, -3300, 7875, -12273, -21963, -12065, -8277, 29350, -28854, - -2963, 23489, 29263, 21037, -7925, 4903, 23353, -12858, -14833, -6806, -8021, 15693, -16322, -6404, - -4789, 96, 22869, -15732, -5058, -24706, 12479, 29009, -17685, 18246, 3760, -11802, -6061, 22219, - -6498, 1917, -31687, -3508, -4062, -28951, 15822, -24926, 8235, -14314, 26865, -10906, -3533, 6784, - 10213, -18782, 2873, -14654, 12179, 32145, -32563, -10748, 3312, 7675, 10349, -5704, -25911, 26494, - 19974, -22739, 16498, 21584, 6855, 24501, 13460, 6079, 16820, 14588, -16555, -6530, -14340, 32040, - 27897, 15580, -1627, -14204, 14243, -1517, 16703, -5912, -4133, 28493, -17201, 30211, -29726, 181, - -20159, -20686, -1070, -28023, -10178, -28907, -7566, -20931, -12770, 22122, 27940, -25823, -8123, -17077, - 25356, -20139, -1607, -26320, -19254, -25418, -10226, 26710, 30758, 1134, 32099, -16060, -562, -1440, - 32353, 26578, 26533, -31488, -13089, 16438, 27884, 23229, 4848, -5131, -11993, 32496, 27111, 28961, - 25401, -24994, 17256, 5816, -31320, -32468, -5, -28389, 10823, -12983, -14345, -13511, -12786, 26786, - 11751, 13616, -6270, 24532, -21014, 21607, 12986, 24220, 12011, 18262, -14787, -2807, 13248, 32015, - -1258, -26004, -32559, -26227, -24181, 27702, 22725, 21388, 5190, -13664, -27109, -12848, -5802, -8641, - -25365, 13186, 21006, -25501, 4815, 29767, 15255, -12599, 7396, 21600, 21100, 19865, 11889, 17449, - 2728, 20630, 3991, 4899, -18224, 15631, 17075, -32309, -7814, -20602, 22737, -1954, -26770, -27987, - 8859, 23410, 2982, -26771, 13628, -13992, -27657, 32317, 2430, -6099, -19192, 25558, 29546, -21821, - -22947, -22584, -19006, 21703, 2575, 13145, 26859, -15004, -30434, -24134, 25747, -1677, 28942, -7742, - 2302, -21715, 13809, 23238, -26258, 31404, 18779, 29980, -17575, 19575, -7021, 14244, -17844, -19338, - -9478, 24478, 7033, -29140, -282, 25030, 1464, 24413, 728, 24428, -1568, 10555, -22003, 8473, - 22561, -10624, 16137, 4514, 7304, 3804, -8407, -23131, -15088, 29270, -18989, -5228, 18750, 23434, - -10869, -18962, -30228, 17103, -2443, 1766, 13191, -31272, 10302, -8607, -29619, -5357, -25426, 29488, - 8184, -25474, -27310, 12572, 11731, -10505, -17052, 31410, -32722, -6244, 29461, -11039, 28064, 2997, - 31808, 27794, 15746, -14440, 2557, -21179, -27900, 12942, -32564, -18389, -30389, -387, 27445, -6195, - 27589, 10892, 14800, -25541, -17521, 15355, 28440, 2488, 3617, 30678, 7057, 29842, 19856, 19515, - -6850, 23614, -31688, 25367, -29310, -26331, -5604, 13737, 31975, -10405, 20888, 29242, -6634, -22498, - -22601, -6812, -8926, -17352, 541, -22924, 466, 31658, 1220, 1101, 30255, 25893, -21758, -18726, - 12835, -8681, -7593, -4744, -118, 13826, -13553, 22828, -13762, -15745, -29327, 6275, -23179, 23845, - 5942, 30249, -18333, -673, -25487, 31263, 3896, -25553, -12778, -31346, -30515, 19749, -18278, -27493, - 26143, 7144, 4682, -19406, 21326, 22956, 2854, 8629, -13595, -18554, -27564, 29688, -3012, -26036, - 11411, -9163, -14181, -28341, 14011, -1676, 14605, -28113, 29978, -8625, -30708, 6977, 23998, -31846, - -22737, -29192, -9203, 10540, -11643, -28423, -11405, -20681, 21243, 15504, 123, 20525, 110, 7924, - 4209, 11882, 3834, -25865, -25249, 31013, 10012, -12718, 24996, -25343, -5312, -16661, -30456, 31244, - -19867, 7345, 6779, -9548, -9076, -29946, -8310, -5888, 27263, -28328, 3313, 3186, 18944, 22078, - 9175, -29818, -14630, 29986, 6472, 13202, 7076, 30646, 28526, -25868, 23974, -26031, -23508, -2486, - 31406, -14782, -28520, 8402, 13734, -32248, -19513, 2092, -26355, -12590, 3044, 20635, 8858, 25921, - 15179, 28885, 11223, 24021, 18645, -11208, 30558, -22692, 3512, 21050, -30735, -2353, -21235, -24062, - -24735, -11299, 26596, -6911, -28099, -20250, 29087, 8086, 4310, -25550, 16995, -20407, 12620, -28873, - -15386, -4824, 31639, 26435, 9668, -31475, -21898, -13910, -13712, -31045, 820, -16805, 10870, -16970, - 11895, 16410, 12888, -31397, -16606, 26898, 8527, 27092, 23576, 951, 8754, 2104, 25870, -926, - 13850, 29550, -29801, 27078, -28967, 6141, -31177, 21124, 18458, -17335, -386, -27080, 1322, 5912, - 13624, -12545, -17530, -1124, -17027, 24064, 15893, 22634, -22863, 2793, -7549, -7503, 25639, 7596, - 26533, 25306, -5235, -24642, -354, 22224, -29630, 23441, 31437, 28851, 32156, -5146, -7709, -15325, - -6095, -5483, 32463, 16997, 1881, -21580, -31472, 18415, 16553, -20536, -9686, -26048, 17167, -19600, - 30770, 24435, 4168, -17262, 16990, 22530, 9854, 771, 19640, 16193, -19689, 4133, -24939, -19113, - 29215, 20409, -30156, -3142, 4106, -8887, -7681, -21232, -9456, 23532, 9163, -24516, -31120, -2161, - 13105, -30415, -13241, 6963, -25104, 14135, 27183, 6124, -15364, -30386, -27550, -26298, 5474, -11006, - 9613, -18004, 9419, -18233, 22025, 13668, -9105, 29375, 16664, -21471, -30997, 29717, -24639, -29063, - 16913, 20292, 11680, 15729, -12665, -31862, -2022, 28872, 16750, 30133, -15028, 18053, 13041, 24829, - -30474, 23525, -1650, 14837, 7176, -14938, -15664, 30166, -7638, 22410, -19543, 10277, -32510, -8540, - 13673, -13557, 8394, 10008, 12860, -7751, 12822, -32321, -26091, -14512, 503, 31420, 18456, -5738, - -27047, -7176, -29570, -30587, -6818, -13894, 29429, 12572, -21770, 12297, -14695, -5753, -31918, -18034, - 18858, -30567, 29304, 2722, -14945, -4001, -26597, 12441, -31379, -14352, -9092, 24759, -21667, 5519, - -12379, -24305, 14640, -17386, 2090, -16562, 10664, -2080, 304, -12216, 2444, -18310, 11698, 26202, - 4265, 16613, 14148, -15449, 21218, -9799, 13667, -29297, -13339, 18533, 13495, -14392, -20512, -27917, - 6433, 29193, 6251, 16294, -20295, 27547, 28139, -31383, -13695, 16365, 30987, -15926, -5172, -14394, - 7300, 28075, -6449, -4434, 20362, 888, 19294, 16295, 6584, -5094, 12676, 6174, -9202, 11523, - -30259, 7440, -25907, -12082, 1985, -3696, 9524, -2473, 3959, -10135, -1172, 25923, -8778, 3334, - -32285, 24017, -23210, 4664, 11377, -24888, 18870, -2664, -14901, 18674, -4226, -27066, 28373, 3262, - -30644, 29110, -30803, 9038, 12825, 12691, -31208, 31253, -10861, -26945, 28598, 5052, -8584, -6392, - -32581, -3633, -29212, -16998, -3379, -3194, 14709, 9415, 26451, -23478, 7757, -31803, -22296, -11241, - -17964, 19819, -6907, -15676, 28743, -30232, 17097, -239, -3190, 25683, 10335, 11388, -9554, -10230, - -16895, -16268, -9604, -12700, 19548, -31467, -21851, -8208, -31246, -19777, 25057, 6184, 4737, -10964, - -21553, 6952, 29680, -30182, 11815, 20650, -20278, 7609, -20897, 3720, 4106, 25170, -28583, -22223, - 4378, 18344, -10236, 17374, 21498, -14198, -23507, -11441, -26587, -25162, -9158, 3547, -19293, -5648, - 10905, -22615, -29143, -14354, -9180, -23566, 21169, 26862, -25268, 7352, -18702, -20701, 1419, -27875, - -10040, 12172, -16020, -4655, -23465, -1884, 4964, 15687, 24430, -25950, 7906, -12505, 29302, -16159, - -7378, -14812, 20681, 18901, 31282, 28108, 28412, 30036, -8973, 6044, -28363, 1704, -16398, 27825, - 20449, -28445, -24473, -16891, 2207, -19964, 32405, 22675, -21108, -14268, 13680, -13436, -11018, 1372, - 24801, 19669, 1379, 27616, 26987, 14007, -8750, 26940, 32005, 24116, 9793, -12553, -30641, 16403, - 32183, -25774, -27170, 476, -31276, -23011, 10476, 15930, -23079, 31297, 15176, -22316, -8728, 15484, - -1461, 27310, -12917, -30099, -12513, -15633, -30704, 11686, 4151, 14426, 881, 16552, 19651, 10779, - 18809, 18792, -17651, 31830, -31855, 30805, 21197, 24267, 8208, 28152, -1762, -12804, -11764, 2710, - 13209, 31299, -4887, 18195, -16979, -20145, 19879, 3572, 12777, -27245, -8013, -27009, 20355, 17642, - 23934, -2037, -14748, 10358, -21238, 28471, 17873, 23450, -2242, 16007, 13287, -32005, 26905, 31306, - 10269, 23057, 15924, -2130, 28251, -3520, 21647, -16474, -2857, -1160, 28102, 15430, 20820, 27121, - -8834, 29022, -4770, 19725, 5015, -23098, 1677, -4971, 10070, -31442, 11870, -12565, 31830, -21992, - 21291, -16295, 22515, 1862, -5700, -10697, 12417, -11386, -13956, -5328, -24833, -15337, -15724, 4890, - -3337, -28510, -3784, 29663, -5698, -22820, 12978, -31695, -14083, 6911, -20771, 334, -43, 10086, - 23352, 8358, 25506, 1049, 3420, 4192, -19015, -15405, -13747, 22435, 24592, -32139, 6517, 9023, - 20359, 31941, -6347, 16570, 4359, 18688, -4892, 14051, 28736, -21062, 11939, 8047, -5907, 25118, - -14932, 830, 5535, -7126, -543, 23548, 26824, 22548, 707, 17747, 23809, 9756, -26450, 24534, - -12184, -26759, -3240, 6254, -28740, 1976, 6593, 26108, -3313, 9415, 20090, 14473, 27698, 3893, - 3918, -32266, -13546, -10870, 17098, -12581, 27151, 23240, 16865, 26590, -24703, 5488, -94, 12707, - 3971, -18237, -14675, -11961, -14085, 8716, -31764, -27966, 9220, -18602, 8538, 29499, 16136, -10086, - -11667, -22051, 4396, -3003, 5868, 14191, -17888, -10628, -10907, -11384, -11562, -13791, -30747, -23966, - 13411, 3929, -224, -12411, -31926, -18497, -12398, 23362, 1640, -30331, -6599, 24075, 25900, -13806, - 10746, 2554, 7144, -19398, -28292, 5805, 27854, 17235, -5229, 25511, -23305, 24300, 27733, -28633, - -22482, -11356, -26716, 12211, -25620, 22317, -15784, 12644, -893, 13715, 9693, -24327, 29420, 19495, - 18907, 6847, 6025, 344, -13966, 14487, 31718, -31991, -1017, -20891, -5754, 14738, 23049, 2404, - -31739, -9196, -22032, -4262, 23401, -21683, 6176, -8456, 10211, 15350, -2768, -24310, -2118, -25378, - -741, -28135, 24665, 6068, -17173, 20828, -13553, 1789, -6765, 9408, 12034, 18703, -27370, 5299, - 29416, -30179, 9921, 22471, 24827, -20945, 9927, 16783, -1340, -11739, -15679, -2728, 18648, 30206, - 25170, -11387, -6418, -16628, 1521, -31240, 9539, 1132, -17019, -15535, -30493, -17045, -25508, -31202, - 23743, -26286, -5117, -26580, -1317, -12995, 28386, -24783, -15060, 16701, 13868, 2743, -20395, -7622, - -7519, 12847, 25148, -30309, 15350, 7427, 7547, 899, 3222, 5194, 21614, 23816, -22707, 27037, - -13381, -23692, 10564, 3044, 25151, -21680, 19866, -12739, -27390, 24941, 14643, 24199, -2483, 16374, - 27362, 1901, -6132, 6642, -16519, 8809, -31102, -2693, 23923, -9394, -28248, -244, 29990, -30591, - -10164, 13792, -17748, 18039, -18449, -31958, -23663, -8383, -12261, -18458, 18585, 23234, -6980, 2473, - 15210, -20145, 24507, 28851, 5629, -22040, 13590, 29673, -2686, -12847, 15187, 20914, 1110, -18050, - -25096, 25552, 13048, 13929, 27092, -7976, 10125, 31702, 663, -14874, 1617, -14875, 1713, -30112, - -32288, 9971, -5648, 309, 11903, 3938, -23762, 17990, 1124, -23282, 19939, 369, -20365, -11152, - 27683, -29347, 28170, 21337, -31853, 31846, -4775, 3832, 24678, -12830, -8360, 9104, -29231, 26666, - -30506, 10114, -24385, 31194, 23061, 6262, 24189, 26264, 22297, -10975, -4710, 17258, -14583, 32068, - -656, 22231, -11869, -24697, 29587, 16190, 9140, 11188, -18359, 19061, 19180, -29672, -18358, 24195, - 10520, -13501, 24835, 3567, -32090, 12790, -24499, -11338, 4928, 13039, -21973, 10692, -21775, 4283, - -23312, -27051, 28992, 7166, -28167, 9289, -27405, -17088, 24353, 23860, -11635, -32603, -28698, -6629, - 30706, -8803, 14489, 29484, -10696, 31746, 3538, -26113, 32745, 20192, -29981, -16963, -24761, 30882, - -10458, -25658, -10816, -2403, -11886, 14871, 18265, -27658, 17431, 30914, -16275, 19159, 24799, 7993, - 19967, 13049, -13615, 8875, 16696, 23881, -6007, 16753, -9221, -9220, 10131, -3689, 1524, 1816, - -17495, 5826, -15881, 15355, -23095, -31332, -13683, -16833, 14145, 22943, 20420, 6332, 21212, -12499, - -2797, 25520, -28263, -31243, 7334, -14120, 24777, 9148, -30345, -15705, 21556, 6569, -1804, -15603, - 15920, -29518, -2785, 16875, 9554, -21605, 27841, -5940, 25643, 276, -19095, -15012, 26548, -23275, - -18956, -11510, -12683, -14598, -32600, 6955, -1257, 31610, 19523, -13158, -11057, -32382, 4293, -4766, - -28633, -18589, -5450, -19652, 18271, -21998, 27489, 20508, -9338, 21972, -32496, 18743, -23969, 379, - -9973, 21559, -28190, -6875, 25600, 12515, 26295, 16124, -32261, -15409, -12390, 9260, 24837, -4344, - -23793, 24217, -29498, -18154, 23688, 30653, 7560, -31975, -3535, -16552, 12817, 25531, 6440, 13918, - 23528, -5122, -22896, -16127, -17540, -3192, -31242, 7474, -3122, -20297, -14367, -12812, 30843, -25940, - -2665, -8241, -10037, -26562, 17004, -861, 5316, 13147, -20448, -28551, 11486, -17915, 27619, -29178, - 23019, -17787, -19898, 17534, 18243, 10730, -1914, 21215, -7938, 17034, -22955, 28264, 12657, -26750, - 23227, 22304, -8920, 18588, 23710, 9622, -14838, 29537, -12658, -10927, -31002, 24954, -32731, 9658, - 23850, -27741, -10232, -17639, 9262, -5289, -13296, -28813, 9210, -31063, 12396, -27113, 30705, -14160, - -1400, -8940, -17483, -15535, -15524, -5919, -11545, -24303, 24007, 6809, -18667, 31704, -24425, -22965, - 26052, 1010, -13007, -23445, 14714, -5769, -7231, 23290, -14485, 27862, -15815, 21453, 24348, 31342, - -9867, 27856, -5002, 13494, 17676, 32171, -24055, 4543, 10156, -25467, -32747, 31073, -8380, -31984, - -19273, -18787, 12530, 6596, 24862, -24925, 21999, 15876, -6971, -4013, -22448, 4789, -16528, 23724, - -28406, -19042, 24049, 5076, 18096, 23594, -8833, -1229, -26786, 20554, -32176, -5114, -28371, -18376, - 20668, 8039, 6883, -28033, -866, -27023, 28962, -5978, 30299, -5743, -32637, 23305, 20080, -28995, - 8498, -12974, -13932, -23756, -11987, -13861, 10677, -1357, -24986, -15360, 17522, -21873, 3630, 5811, - 3377, -13129, -26316, 7412, 8206, -16287, 6445, -15904, 32200, 995, 24398, 24312, -29217, -22756, - 4355, -29260, -19623, 17797, -25785, 31829, -31873, 32736, -27993, 1826, -22004, 22837, -23608, -23586, - -22133, -3981, 22211, 18270, -13316, 27148, 16866, -14802, 16933, 24191, -26827, 27617, 25307, 22083, - 26443, -2400, 18406, -19328, 20940, -32084, 5379, -18550, -25741, -20787, -8799, 7308, 17140, -13491, - -29601, 30057, 17719, -4313, 9300, -1709, -6609, 20917, 882, -11837, -2130, 31142, -2979, -15152, - 6689, 7555, 20065, 22020, 8265, -3110, 26832, 25610, 2014, 25438, 20766, -3625, 24349, 9074, - -15284, -10124, -10697, 32154, -18179, 30646, -32044, 12766, 10017, 8542, 8491, -12409, -20641, -5962, - -28569, 2117, -17803, -265, -20856, 17685, 9841, -24080, 1168, 32181, 32720, -27716, -24760, -32239, - -18111, 2947, 300, 13993, -16392, -8762, -13027, 4316, -11676, 32552, 19707, -22606, 22925, 17197, - -19724, 31258, -32597, -9016, -27873, -17500, -27253, 8610, -28731, 330, 3131, 11914, -25576, -2674, - -30465, 9313, -482, 2026, 17448, 24113, 29815, -30885, -3205, -31178, 26839, -9153, 5995, -28673, - 19138, -6518, -12040, 10731, -193, -9400, 17585, -23769, -1659, -6539, 788, 18195, -30823, -26710, - 18205, 23967, -9194, -29941, -10865, -28773, 598, 4624, -20991, -14399, 21336, -2912, 30772, 7694, - 5363, 25557, -6894, 9282, -7878, -19665, -23271, 16057, -5558, 24378, 4485, -50, 2994, -24232, - 11944, 19646, -5054, 21257, -3679, 31972, -9071, 5467, 23, -29520, 19462, -4847, 26467, 31080, - -23892, -19304, 15537, -2343, -26912, 25631, 25526, -5169, -2879, -21474, 12282, -12575, 4729, -21074, - 13469, 2932, 31187, -8459, -12608, 31633, 31785, 21874, 20401, -24373, -6337, 14520, -16654, 13755, - -10995, -8677, -6386, 25683, 7833, -15181, 25299, 1367, 15330, 10897, 24660, 6479, -12943, 9021, - -5056, -425, 18450, 26472, -18100, 27464, -31514, 5049, -27682, 6352, 2383, -2106, -10083, -13590, - -12236, -26430, -30008, 6664, 27416, 11953, -24978, 31429, 9859, -21418, 15073, 19428, 31593, -2619, - -13857, 12293, -20376, 28340, 8919, 9926, -17417, -6688, 28711, -15912, -7747, 31347, -27041, 24881, - -21912, 20602, 8867, 15580, -17539, -2552, -11804, 30140, -11846, 30885, -1937, 4500, -28786, -20048, - -14702, -20174, -19446, -13066, -16793, -30662, 24440, 22473, 22770, 20136, -2248, 30451, 30504, 1566, - -3474, 24665, -4402, 1394, 12819, 16446, 21525, 2217, -16217, 9715, -9996, -3650, -9137, -29502, - 2345, -27206, -16987, 12110, 19146, 14977, 29818, 25312, -8221, -6998, -10514, -3086, -2893, 5626, - 8918, -5248, -18875, 2266, 4569, -30269, -4706, 18062, 29990, -11290, 564, 15383, -1797, -22373, - -3655, -29624, -7167, -16261, 511, -25988, 11396, 23704, -13392, 13904, -8639, -16328, -28992, -11225, - -5049, -1126, 11059, 9848, 22965, 22370, -23691, 28073, -3615, -8638, -2980, -12779, 3505, 15151, - 28313, -30789, -30112, -11440, -27635, -9865, -4813, -22300, 8819, -25820, -28029, -8191, -28880, 13584, - -17766, 14297, 30247, -20872, 4585, -32483, 13606, 9868, 9686, -10718, -7613, 19604, 13594, 22298, - -29628, -21433, 9754, 20131, -2044, -20064, -31566, 10341, 15606, 23521, 26133, 21053, 12535, 25768, - -25721, 14734, 4470, 2567, -14190, 19616, 20098, -28220, 17807, 20386, 18361, -14853, -8211, 31089, - -15915, -29585, -27620, -13236, 26825, 8192, 20028, -19666, 32314, -16336, 23140, -26505, 7383, -24016, - -31860, 19859, -16758, -17582, -17508, -31131, -23420, 10791, 793, 23106, 21006, -1496, -18848, 29958, - -9279, -23036, -26270, 4577, -3709, 29185, 2590, -16274, -14044, 19042, 8362, 7965, 7984, -30338, - -22, 9897, 30697, -2755, -10659, 32625, -11599, -5346, -11031, 13843, 20875, -4801, -14888, 10369, - -29463, -20142, 27498, 3605, 21905, 29875, -4231, -6747, -25377, -19780, -5361, -16147, -13582, 5037, - 21313, 11207, -11886, 22337, 26342, 15603, -10253, -8649, 19546, 14390, -25195, -16897, -8189, -8402, - 30851, -15070, 28531, 4550, 595, 5033, 1586, -24267, 19266, -25202, -3212, -28027, -23031, -5559, - 31471, -10778, 16603, 29669, 8423, -29704, -11875, -25314, 31305, 21311, -5978, 28328, 20253, 14640, - 6776, 7109, 15806, -6086, -3004, -10379, 21488, 31829, -19832, -20319, 15683, 25640, -6617, 6252, - 15672, -30228, 32557, 2428, -949, -14900, 15208, -10901, 5323, 21469, 31312, -6005, -11865, -14639, - 9111, 1646, 19592, -25642, -32069, 23982, -92, -21618, -12540, -24919, 2084, 11665, 6942, -1656, - -1201, 18464, -30272, 28483, 5656, 32675, -5369, -6260, -6792, -31733, -1111, 27061, 8431, -10740, - -18628, -20074, -29107, -23404, -23787, 6503, 17760, 24224, 20885, 26909, 6239, -21186, 28013, -2040, - 2905, 26263, 18870, 17539, -7977, -26423, 3300, -5348, 10060, -18828, -30647, 17726, -23166, 4350, - -3066, -11160, 19665, -21552, -27655, -17142, -32695, 25093, -23336, -2180, 29356, 434, -31655, -5708, - -8681, -25331, -17103, -26175, -13477, 3248, -5732, -13715, -21791, -20774, 22895, 22760, -25577, -27207, - -17536, -13948, -18155, -5912, 15794, -32434, -19685, 11680, -31774, -12214, 26314, -26340, 3074, 22756, - 1128, 26385, 24714, 20835, -5869, -3511, 13117, -24210, 11436, -16694, 31826, -5526, -8838, -22048, - 9525, 2338, -3686, -15706, 4567, -23299, -11843, 9420, 3482, 24938, -31547, -2390, -30226, -4822, - 28798, 32535, 3196, -7459, 24293, 22418, -28375, -13370, -25885, 26743, -22667, -10666, -29526, 24731, - 11058, 22833, -11614, -6147, 13249, 3885, -8801, -16597, 5279, 24800, -25930, 5339, -13849, 4537, - -13923, -28429, -10859, 24681, -7634, -20691, 25562, -27874, -7266, -15501, -11131, -20470, -451, 2000, - 8244, 23197, 32430, 12133, 14838, 13064, 20325, 31235, 7047, -3207, 5121, -31917, -16979, 13178, - 435, 25579, 26863, -2393, -30195, -9542, 12471, -3234, -25225, -26144, -2469, -10449, 5698, 5082, - -29461, -14230, -2300, -12762, -16050, -23863, 22839, 1159, -21951, 28614, 20277, 18481, -18914, 25610, - 29706, -7896, 18567, -15000, 30512, -9117, 17221, 28586, -10131, 26989, 1948, 5681, 11249, -18298, - -3005, -15944, 2723, -1651, 15236, 7673, -16681, -4041, 8909, 9634, 2232, 4591, -1398, -10050, - -25446, 30782, 12552, 9616, -15900, -2219, -430, 10176, 7318, 27148, 12143, -24061, 1884, -8536, - 5596, -23195, -32753, 18720, -29538, -23010, 6096, -18897, 8752, 594, 7689, 23028, -3967, 6015, - 11558, 16020, -19569, -29067, -23868, 28346, 24982, 9421, 403, -16432, -26395, 12436, -15645, -30044, - 22970, -17224, 8592, -3469, -4307, -12983, 13005, -21908, -17520, 951, 4780, -30334, -30766, 25221, - -23166, 26267, -6204, -25613, 14158, -28143, 31046, 21098, 23892, 21633, 27587, -3704, 23805, 686, - -5545, -1535, -9279, -26688, -31905, -18994, 10327, -14112, 14368, -31001, 2968, 18042, -7360, -14955, - -11976, -23051, 10844, -26938, 26069, -21283, -30067, -18094, 13043, -13800, 23434, -23829, -23215, 2281, - 21216, 4397, -2654, 18739, -7780, -17067, -11672, -6165, -22089, 28195, 28566, 6165, -24624, 1335, - -30618, 6154, -10034, -5917, -18775, 5384, 24495, -26003, -10160, -19932, -24209, 21102, 30423, 21194, - -30122, 23851, -15261, -30782, -32629, 970, -7500, 24594, 19291, -30760, 5710, 8541, 13307, -14763, - 2036, 16140, 7501, 18790, -5400, -9809, -9061, 3346, -24465, 12829, -24969, 29843, -5631, -8403, - 2222, -26489, -31308, -31542, -25692, 31200, -22767, -2422, -31689, -14657, 20238, 31316, 17548, 16285, - -29960, -30039, 17, 13544, -18546, -14585, -31263, 7575, -30481, -32359, -2948, 20126, -24875, -7344, - 16536, -3970, -17607, -5091, -13460, 9018, -19505, -12105, 15962, 9707, -18259, -29198, 30595, 18202, - 2220, -5797, 7761, -1021, 4895, -27248, -10844, 2067, -9978, -9324, 19142, 12549, -9858, -7923, - 3114, -28730, -3920, 20907, -8343, -2799, 20312, -16982, -30117, -9387, -159, -1380, 31897, 17595, - 14773, -15291, -16115, -13925, -12069, -27574, 12746, 3124, -25932, -27888, 19874, 13955, 9805, 27750, - 30912, -27601, -4646, 28348, -4364, 20078, 18574, -14243, 27906, -17, -31629, 22491, -30499, -1538, - 16826, 22300, -27522, 29618, -32646, 12450, 21807, 19226, -19204, 13729, -9194, -13485, -2452, 6675, - 19390, 21933, 21783, 32016, 26635, 26324, -21991, 29598, -10880, -12333, -24852, 5016, -17720, -31394, - -5041, -593, 9205, 9094, -11784, 10181, -20640, -13438, 27737, -23118, 6721, 15646, -7129, 19278, - -24668, -12820, 1123, 10557, -29059, 7925, -23530, -3093, 27018, 25327, 20784, 14056, 20025, 5815, - -14024, -31194, 13831, -20303, 1974, -11628, -21113, 28705, -29706, -590, -18359, 25538, -17652, 24549, - -31027, 31155, -1246, 20016, 10764, -11312, -16068, -9029, 21402, 21805, -21331, 17777, -20670, 6588, - -9217, -32408, -30607, 21072, -10299, -26670, 17615, 5580, -27911, -10551, -3694, 11316, -12623, -15268, - 2629, -974, 12858, -18474, -6539, 7728, -22435, -24334, 15783, 11842, -23663, 13429, 26762, -18362, - 31251, 13094, -21040, 23012, 15938, 16202, -10114, 52, 12016, 12379, 9864, 10868, -4260, -18799, - -19221, 14099, 6537, 32202, 12475, -16238, -10072, -20909, -3013, 23665, -3763, -7570, 10728, -12714, - 13802, 21420, 12397, -12709, -32645, -28459, 12499, -28113, -14220, -24616, -19897, 29669, 18827, -20679, - 14329, 10043, -18916, -6201, 6486, 22158, 30110, -31289, -405, -18801, -19701, -28796, 13159, -24965, - -23788, -17356, -19435, 15646, 14842, 30333, -31045, 16666, 17712, -14166, 28738, 14923, 25464, -12034, - -12969, 3475, 13754, 27233, -12029, 12887, 32090, 8702, -22568, -27540, -24298, 30513, -12046, 13548, - -14178, -27463, -19826, 8138, 26423, 13839, -976, 10066, 18431, -26418, 5436, 28677, -25637, 27694, - 5786, 10182, -22341, -11503, -9308, -26307, -23371, 28869, -18342, 7653, -12136, -10486, -18306, 28626, - 17957, -7485, 7596, -22501, 13998, -469, -6826, 24930, -14053, -23486, 26011, -27430, -14980, -30175, - 28087, 20649, 15642, -21687, 21639, -17263, -29936, 3679, 16935, -25412, 25843, 14333, -8317, -19586, - -8882, 4509, -29067, -30570, 23543, -4194, -16700, -4642, 13229, -8293, -30695, 28761, -5409, 22563, - -6745, -6440, 1528, -7230, -16810, -8298, -14872, 15374, -6487, -2765, -21251, -11374, -16224, -18484, - 5898, -10929, 9112, -24569, 11306, -925, 11977, -1086, 13831, 31167, -16876, -32443, 16328, 2498, - 10131, 23562, -7430, -15110, 32310, -28450, 18344, 13200, -24196, 26736, -12807, 8320, -5808, 26703, - -23279, -19972, 7825, 6794, 27259, 31622, -27492, -18173, 28569, 3822, 10465, 32044, -14828, -30199, - 29990, 14023, -21990, -25475, -24244, 6774, -28669, 26721, 2063, -20488, -15818, -924, -20408, -11569, - -4422, -24834, -21481, -24763, 21694, -24122, 24391, -10983, 28674, -30444, -1866, -14986, -23220, -17964, - -30983, -20889, 376, -30796, -18480, 14770, -19710, -23176, -19625, 31103, 25309, -3001, 1457, -26132, - 4743, -21837, 17027, -12545, 19368, -22829, -22699, 29876, 26785, -26202, -11280, -1232, -6380, 21471, - -18121, 23197, -26143, -726, -3234, 3430, -31559, 5870, 14049, 32121, -221, -20021, -9651, 11023, - -27337, 14684, 18037, 32190, -8000, 18055, -29509, -4789, -31509, -21820, 6417, -10148, 26051, 6743, - 28665, 12897, -10966, -2001, -19062, 16321, 5735, 30484, -1836, -1563, 20178, 13820, 3620, -5982, - 3130, -10019, 20785, 3870, 10008, 19050, -9679, -7051, -19486, -11218, 13067, 28405, -22098, 812, - -5120, 15886, -19257, -18746, -31072, 24191, -15049, -1156, -11965, -27644, 1159, -5608, 31080, 18694, - 6628, -28947, -10224, 12530, -1069, 5268, -18978, -26616, 22716, -5599, 3757, 17954, -24837, -15392, - 1941, 27001, 15701, 6435, 9551, -14536, 18819, 22193, 18179, -28627, 5517, 1251, -8062, 6520, - 5119, -14720, 26931, -2903, 17006, -24517, -7354, 27759, 15453, 14847, -15746, -23472, 12469, 14581, - 23635, 31043, -10547, 325, -21361, -15796, 27262, -11189, -372, 3246, -30181, -18667, 20728, 1861, - 30855, -31916, -810, 24000, 10555, -19121, 22881, 19436, 13048, -19962, 4256, 17164, 13138, -31247, - 23755, 1044, -3378, 26872, 23462, 13439, -2464, -15641, -5622, 21749, -31518, -10157, -2148, -25212, - 32419, 25252, -26113, -8489, -17429, 2029, -6689, -30080, 6785, -229, 19837, -12187, 11910, 11399, - -1146, 21683, -14542, 30336, -2928, -26244, 22010, 21516, 20942, -1683, 13046, -12925, 29228, 17876, - 22704, 13064, 16063, -8270, -30834, 25484, -22809, -29539, -21523, 2748, -2570, -14132, 15173, -4432, - 21382, -3031, -19639, -3922, 23008, -21146, -5006, -27500, -9071, -30318, 20173, 31212, 14795, -7107, - -5296, -15034, -23050, -8533, 7721, 10474, 8765, -25861, 11871, 19883, 17189, 1928, 242, 8673, - 3257, -21962, -24828, 26717, 11766, -11436, 6917, 4439, 17302, 8359, -6485, -21948, 20595, -2616, - -10961, 12129, -19815, 27560, -13486, -25244, -16241, 6947, -18359, -9148, -29484, 10752, -10531, -13103, - 13473, -22672, -5977, -12401, 19151, -18880, -8836, -28829, -29374, 6306, -21272, -9352, -29616, -6715, - -22889, -18781, 4044, 24219, -2654, 2648, -17490, -8840, -644, 27292, -15126, -2128, -16437, -11879, - 22464, 6517, -18704, 23329, 19336, 91, 10761, 13128, -14144, 675, -28456, -712, -27418, 24041, - -1859, -29618, -8671, -4782, 6311, -31241, -21387, -9469, 1323, -2452, -13351, 7778, 4716, -32733, - 12705, -9672, 22690, 1669, -22974, -5156, 12003, -17829, -6666, 9681, 14393, -28225, 8223, -8068, - 28885, 4277, 19852, 8288, 5197, -31279, -5391, 2087, -5605, 31987, 17531, 328, 474, -24003, - 17479, 4484, -16078, 19124, 153, 109, -25172, -7242, -30101, -8113, -21909, -26305, -3534, -27972, - 323, -4600, 19682, 11530, 24664, 9862, -7745, 7886, 28331, -3772, 25162, -27753, -12582, -14834, - 5583, 14635, -11664, 22610, 23681, 5806, 11077, 20840, -9351, 6045, 25053, -15643, 11179, -23067, - 5888, 16927, -9764, 7026, -6409, -26780, -15108, -5211, 16635, -4078, 17634, 2793, -15674, -32635, - -7250, 30837, 19956, 9528, -4449, -11912, -28288, 6116, -27854, -2510, -9014, -2681, -10603, 29406, - 25079, 10017, -20058, -20323, -2417, -1119, -14138, -32380, 20394, 31070, -2695, -29913, 6590, 32498, - -602, 29716, 6483, -10963, -15355, 7686, 5669, -25170, -18088, -741, -25948, 12459, -12132, -26477, - 25915, -23195, 7188, -31132, 3121, -21881, 28891, -29251, -7043, -19771, 26647, -5461, 10657, 2759, - -17591, -6644, -17350, -27238, 31242, -26202, -16633, -32475, 23566, 28677, 8551, 11595, 7518, 13077, - -15577, -3972, -27772, -32402, -14132, -21486, 9146, 1556, 26826, 21615, -16381, -9336, 14160, 17735, - -10491, 24565, -3711, 15283, 5180, 24577, -19257, 28967, 21379, 22366, -18624, -15016, 13791, 11185, - -26056, 28261, 1555, -22712, -31889, -24636, -18883, 14639, -32281, -18571, -8533, -31799, -188, 22577, - 17266, 17856, -12822, 13941, 19627, 14924, -15933, 6098, -6023, 8406, 22965, 28252, -22455, -27003, - 27361, 21204, 24147, -26475, -22135, -920, -30608, 25855, -8196, 17404, 17383, -23337, 32758, -1257, - 17236, -4018, 6467, 21759, -12121, -2292, 20335, 14277, 17929, -10957, 4268, 31233, 26354, -389, - -13969, 16158, -17578, -12293, 28290, -31423, 29576, -4507, -3755, -18754, -9831, 25816, -1595, 8406, - -32321, 12549, 28839, 19174, -18875, 19118, -15596, -27971, -7517, 10699, 18740, -14820, -15435, -32407, - 19919, 12433, -19526, -28120, -21326, 8741, 4435, 10544, -5061, 14945, -32562, -13901, 29096, 25236, - 29957, -26730, 16067, -273, 13749, 8057, -2059, 21514, -7277, 22249, -31678, -1142, -11942, -23640, - 2428, -6706, -22992, -31595, -15140, 4584, 32590, 14070, -17395, -7193, 31808, 29810, 20351, -17262, - -6468, -28713, -19488, -4762, 18006, -19736, -18566, 8249, -3331, 28287, 17032, 5445, -6553, 26335, - -4817, 20927, -31462, -6682, -11052, 18251, -12545, -22195, 162, -29258, -295, -16964, -7314, 133, - 11471, -25218, 6397, -24519, 28452, 9950, 22625, -9282, -21419, -19489, 23433, -17019, 32034, 23929, - 23918, -5076, -15061, -6817, -825, 1670, 9728, 25603, -22285, 8615, 11532, 30346, 3539, -27282, - -31558, -8136, 20397, 1057, -17754, 13212, 27609, 12337, -8114, 24417, -28075, 15866, 4912, -23807, - -7162, 28705, -2492, 15737, -23769, -74, -32042, 10110, 19864, -3485, -2013, 25504, -21242, 28290, - -8054, -18559, 30865, -13298, 8157, -16346, 5050, -31534, -17210, 4509, 11850, -30950, -8177, -20392, - 22117, -17571, -11142, 6457, -31303, 16221, 6791, 4182, -14308, -7800, 23339, -20594, -30125, 13747, - -28938, 17900, -12760, -5315, 30750, -4870, 9748, 16190, -15641, 15322, 26854, 12230, -9908, 31853, - 5778, -17588, -887, 8538, -13857, -4959, 29174, 13446, 26511, 32279, -15462, 9444, 3558, 17194, - 22260, 19927, -1385, 11522, -812, -15124, 7713, -7572, 11861, -20675, 25422, 16006, -29850, -9789, - 3902, -7042, 6719, -17330, 14489, -10031, 30578, 9921, 13969, -21521, -10159, 29259, 13725, -19437, - -7013, 3737, 27573, -31172, -27494, 4693, 13750, 30203, -32427, 7449, 13271, 948, 2950, 20929, - 14883, 4874, -26255, -22796, 32605, 19851, 13521, -1641, -5512, -14095, 32683, -5721, 12484, 32445, - -3459, -26438, 15544, 28427, -8198, -23033, -11881, -30038, 15350, -1518, -8058, 28930, -13630, 30731, - -5348, -31395, -30831, 5553, -5039, -5413, 15317, 6952, -14618, -29131, 29894, 9638, -29598, -9417, - 6269, -22769, -7403, -14385, -18061, 30385, -31867, -11513, -2332, 19356, 24306, -25940, -21471, -26289, - -27191, 24853, -19369, 473, -5810, -12265, -22222, 2066, -31836, 23008, 19842, 20735, 13230, -2052, - 16750, -9819, 1450, 26325, -22202, -31554, -15733, -3259, -17350, 140, 5855, 30227, -1060, -21333, - -18760, -4172, 11377, 2365, 17595, -3473, 1627, 9404, 1175, -882, 22507, -14553, -31030, 2111, - 97, -12049, -17440, -31005, 28462, -26730, 8491, -6065, 10276, -22078, 14378, 24858, -10699, -23552, - -4623, -13312, 17299, 21112, 16552, 546, -19998, 27585, -31371, -6440, 23662, -15997, 13450, -10104, - -27572, -8198, -13491, 24442, -17823, 28764, -23039, 18739, -17137, 13006, 6491, 16859, -17899, -19651, - 31212, 9985, 4740, -15097, -11577, 26305, 28577, 5746, -11793, -25869, 5486, 844, -17454, 16332, - -10300, -6363, 28010, -5496, -12120, 15841, -2391, -614, 6879, 1491, 11593, 76, -11456, -3049, - -14250, 2090, -32290, -417, -10649, -12283, -32757, 21423, 24308, 3023, -10152, -30939, 20315, -23004, - -11566, -11492, -20252, 8037, -255, -11653, -14565, -25693, -27733, -13946, -22044, -31789, 180, 25471, - 27004, 11376, -19649, -24868, 27504, -4414, 6039, 25203, 3480, -3402, 5318, -18190, 18247, -2683, - 14429, 488, -2751, -4591, 12509, -19348, 22147, -7663, 3037, -6341, -22477, -27665, -9836, -2264, - -516, 15722, -9089, -16882, 5675, 306, 32312, -32494, -10578, 28047, 19635, -3349, 9579, -30887, - -2782, 20584, -2601, -30571, 28563, 4733, 20154, 2288, 12090, 9731, 27610, -11109, 22528, -22771, - 24937, 22881, -10532, 22414, 4282, -9349, -19490, -32281, 27170, -19742, -15266, -8957, -16410, 8981, - -12569, 3733, -13419, 13891, -16191, -11142, -24990, -4989, 319, 1089, -22754, -26345, -10346, -2366, - -17915, -13354, -16912, -20450, -14528, -20075, -19935, -5754, 29748, 15919, -23394, -13463, 18822, -17165, - -15428, -9496, 2989, 20036, -4661, -32608, 1078, -20959, -32236, -11458, -32463, -26341, -10871, -18076, - 6297, -2767, -13855, -4417, 18505, 26475, -5640, -26120, 3647, 11980, -13827, 17406, -9749, -15189, - 14184, 16549, 25348, 12417, 16781, 1964, -529, -6667, 31660, 10186, -25401, 19824, -8886, -24123, - -415, -26680, 1064, 22874, -16427, -30598, 20354, 18049, 386, 25301, -6480, -31146, 30294, 2289, - -32164, 25775, -16548, 32715, -32487, -28367, -32198, -16657, 29399, -25173, -18026, -7240, 2154, 2810, - -24664, 25316, -3288, 4206, -3, 24915, -24166, -717, 24699, -18993, -2577, 18149, -20105, 3546, - 31132, -22160, 21172, -2740, -17390, 15178, -16444, -7396, -16000, -4326, 31168, 32198, 9029, 14273, - -30231, 30242, 30897, 12637, -13823, 13206, 17361, -21896, -26234, -7623, 1204, 5164, -1430, -29578, - -15661, 21590, -10973, 4192, -7012, -46, 31055, 13748, 21651, 12708, -12611, -1756, -5159, 1849, - -24557, -8714, 17087, -32124, -20342, -18605, -22580, 728, 26808, -13934, -25216, -1099, 19743, 20125, - 11850, -19776, -29191, 909, -26108, -15700, 27175, -6343, -5066, -2819, -26352, -31174, -24600, -29954, - 15053, 13535, 27677, 19984, 21244, -1160, 24860, 1285, 7664, -19331, 20072, 4007, -14741, 10069, - 19870, -26388, 6207, 17271, -27282, -13109, -3738, -18266, 11490, -27923, -6919, 13177, -14225, -8101, - 30925, 28021, -22282, -30609, 4971, -32545, 25803, -11565, 8007, 26394, 12000, 9058, 5518, -904, - -29912, 25654, 12711, -30509, 13568, -16790, 24880, 31923, 7980, -25598, 19562, -12337, 21321, -6307, - 18193, -19356, 7293, 25453, 14736, -27400, 5330, 18300, -4223, -733, -24720, -16849, 29919, -809, - 2921, -12656, -13406, 26568, 7059, 25690, 6300, 14560, -17918, -5785, 2115, -7680, 28939, 31414, - -26118, 29722, -1438, -32693, 8674, -19881, -29083, -9386, -5138, 13041, -30939, -8736, 2883, -24118, - 28141, 6215, 15909, 30525, 23402, 15453, -31461, -27660, -8711, 27659, 23992, -7554, -10773, -20071, - -13323, 18773, -18611, 29107, -5221, -9783, 18534, -20320, 12784, 8251, 23717, 19163, -15997, -15131, - 8943, -3390, 18443, -26935, -24911, -22604, -6404, 14761, -10419, 28441, -30960, -3961, -19856, -14976, - -22038, -1557, -2100, 9718, -31198, -11005, -32516, -20655, -24976, -17512, 11244, 6363, 8540, -14064, - 8314, -28420, -5648, 9931, -25190, 29478, -29615, 12758, 13281, -2622, 19314, -8751, 13022, -1141, - -15076, -12952, -31841, -13993, -22645, -1450, -25087, 6209, -32340, 26951, -29836, 20197, 24969, -14420, - -10327, 22841, -28989, -24388, -22607, 17722, -20893, -30792, -6217, -30208, 28400, -25779, 9852, 31550, - -3945, -5568, -22019, 29266, -23278, -2383, 27106, 16012, 5682, 27138, 5152, 12386, -2520, 2188, - 13756, -21240, -14560, 9762, -183, -8816, 25489, -31546, 29836, 17103, 20773, 2707, 5399, 30109, - 1094, -19288, -24623, 28069, 21595, -27007, 7965, -1130, -4849, -9040, -21076, 2239, -20717, 7941, - -29180, -31534, 23841, 19536, 14492, -9733, -2089, -16169, -30718, 10705, 11182, 3592, 26941, 7112, - 7051, 15758, 28502, -22797, -8732, 28827, 22630, -20178, -32563, -5923, -262, -24248, -11922, 7342, - -26435, 9381, -17876, -19605, 3339, 32442, 14781, 15266, -9366, -11508, -22691, -6981, -24376, -12271, - 11128, -2422, 23929, -21015, -6779, -23614, 31116, 5122, -21452, -29651, -8868, 13022, 10946, -30262, - -816, -11220, -1435, 16189, -16450, -16270, -4287, -3696, -30485, -27510, -12858, -30465, 31207, -21520, - 16102, -16121, -8703, -27407, 2387, 18595, 12430, -7231, 23237, 6528, 21146, -22283, -12636, -32485, - -22668, -22437, 9214, -24875, -24961, -6148, 15950, 27367, 11193, -25353, 9098, -21609, 7161, -26506, - 24691, -19927, -2810, 2882, -24189, 3867, -10909, -12131, 1286, 15595, -21353, -31000, 30948, 25020, - -20818, 26653, 21061, -12964, -6539, -31446, 6210, -12361, -4972, 1760, 2750, -26343, 8585, 31271, - 7201, 22781, 16257, -15564, 17170, -12107, 27003, 23198, -18190, -29886, -5877, 1647, 25162, 29214, - -32376, -27975, -11716, 11758, 4343, -10697, 13275, -32403, -8249, -3373, -2251, -21690, 3606, -31278, - 11228, 17349, -14052, 9229, -27420, 5769, 12450, 26490, 5602, 1228, -29892, 30859, 528, 10136, - -6232, -18243, 14790, -24515, -32029, 9812, -30850, 32547, 18331, 20622, 1659, 25691, -15950, 15062, - -30324, -2175, 25926, 14420, 15067, 4990, 3171, -3130, -26687, -31975, 12825, -22912, 21754, 7586, - -1712, 18496, 7254, -27560, -17867, -30300, -26950, 25680, -10848, 21183, -14695, 9348, 18823, 18480, - -16658, 17387, 30657, 7620, 1027, -18477, -17917, -12343, -28535, -13623, 30856, -13529, 16521, 29127, - 17404, 20511, -779, 24268, 30253, 13122, -11294, -11486, -27141, 919, 30640, -21361, -25629, 7613, - -16722, -19148, -10030, -15974, 21177, -7521, 21550, 6396, -23613, 17773, 32339, 11685, -13238, 29115, - -24735, -30145, -17623, 27354, -9621, 7557, -19113, -31796, -26935, 2015, -23559, -15155, 1936, -19957, - 22269, -12508, 3768, 16432, 8313, -24370, 4461, 22700, -11248, 24478, -2851, -4700, 13306, 8409, - -5896, 17293, -18620, -30395, -24552, 26407, -17785, -26079, 20350, 2238, -23083, 15467, 13958, -7175, - -8232, -32723, 1423, 8825, 31319, -4048, -6905, -4631, 6333, -25601, -25737, 24781, -14588, -20672, - -27007, -2872, 16893, -20885, 2677, 16085, 17918, 4267, 7037, -28876, -4331, 19573, -3785, -32654, - 11037, 1799, 20672, -5106, 3577, 22863, -22548, -2041, 23484, -10820, -10683, -18317, -2578, -8182, - -17210, -15336, 29688, -6160, 16907, 30789, -24391, 10037, 26578, -20553, 4051, 28408, 14165, 13610, - 7665, -14760, -30552, 21984, 5141, -13977, -29272, 20247, 21501, -28848, -2562, 4802, 27791, 15732, - 13715, -8463, 5586, -27908, 7713, -24447, 12535, -15060, -16051, 1955, 12867, -26043, -3160, 25148, - -6746, 29619, -7778, -6913, -12813, 16439, 15367, -28851, 25979, 10611, 3986, 749, -26428, 25963, - -9943, 26647, 13816, -28576, 15522, -14522, 32074, -29166, -1134, 23084, 20978, 13043, 30480, 15288, - 19567, 27961, -13087, 29105, -17995, -8355, 10796, 7685, -17905, 23845, 4581, 8595, 8946, 21426, - -24785, 1105, -26213, 22116, 6593, 10905, -29781, -17705, 18656, 16812, 29801, 31953, 3520, -4383, - 2712, 11314, -25352, 15224, -14514, 7690, 26503, -13393, 28751, 11062, 21946, -6347, -28368, -28294, - -15297, -10732, 4518, -2171, -9592, -16790, -6580, 29310, -7979, -3412, -18375, -4784, -9536, 18771, - -242, 29874, -31886, 21143, -10748, -20340, 12486, -25300, -1270, 28531, 28541, 22445, 10631, 20363, - -9026, -7111, -18887, 32763, -10060, 10807, 168, 12185, 1587, -14734, 6704, 9214, 10281, -11027, - -28811, 7339, 31635, -30444, 8040, 24036, -25579, 28834, -21078, 20541, 6932, -6718, 693, -14558, - -6343, 20802, -27697, 32276, -1542, -2635, -14460, -10504, 14712, 19107, -28804, -29342, -647, -10243, - 9798, -6674, 18012, 14124, -29876, -8614, 14273, 30450, 12911, -15002, -27400, 23065, -11339, -13989, - -30933, -30583, 7932, 30374, 13516, 19591, -7546, 8242, -27431, 5843, 20161, 14997, -12347, -21844, - -8640, -13804, 16656, 5163, -28515, 10532, -23946, 939, -12084, 14205, -23405, 13767, -21797, -6068, - -20945, -17628, 854, 19986, -2284, -4061, -11755, -15763, 11542, -24456, 22483, 4494, 10062, -30299, - -6898, 24800, 32554, -10680, -23719, -22285, 3273, -4941, -8085, -26282, -22647, -11757, 28710, -9195, - 18686, 9066, -19054, -26133, -15667, 19606, 30440, 19595, -5776, -6759, 26618, -21464, 23728, 4209, - -24741, -2061, -23634, 26300, 28780, -11770, 13562, 820, 3808, -799, -15465, 28253, 29648, 5208, - -18779, -22534, -6466, 29081, -13353, -8115, 13053, 11401, -16377, -31817, -23319, 5258, 2145, -20322, - 5953, -25190, -31323, 10044, 10645, -26384, 22082, -24968, 16786, 15970, 7722, 13816, 13952, -9033, - -17918, -19376, 30800, -21764, -26303, 10449, -11222, -19173, 16898, 12758, -17087, -27808, -5384, -26498, - 31527, 21905, -3824, -28643, -22053, -31097, 9313, -21149, -26261, -27205, -13177, -13461, -17330, 29158, - 31797, -16358, -2829, 23562, 18103, 21443, 26080, 9902, -1724, -2353, 26203, -22014, -24999, 16682, - -9481, 5784, -25232, -31461, -2242, 2699, 23819, -1076, 8429, -14419, 6467, -6096, -11393, 27869, - 30317, 31095, -29863, 24916, -21077, 24142, 583, -9291, -31136, -17841, 16694, -14702, 14480, -28281, - 13772, 14871, -22633, -16022, 29885, -4959, 32644, 19313, -15320, -7289, -23340, 4072, -5294, 2619, - -4068, -1456, -31522, -3005, 15321, 15467, 17427, -6610, 10318, -15706, -27712, -23064, 4534, 23177, - -26049, 30943, -3009, 3292, 28571, 31544, 4651, -15835, -23688, -12713, -29806, -29851, 2816, 6272, - -20182, 29124, -8846, -22764, 20655, 31485, 6460, -14197, 30246, -25177, -26184, 7009, -32697, 9012, - -10597, -20472, -1043, -32431, -23273, -20149, 11737, 22600, -21655, -20685, -14954, 1070, 26654, -2086, - 10072, -4862, -18267, -17534, 6097, -3696, -28857, -28673, -2263, -7412, -27915, -11637, 8798, -30688, - -23616, -2401, 28046, 31687, 20598, 29921, 5308, 32573, -15330, -3122, 21149, -5278, -1224, 13643, - -9159, 11031, 4454, 9849, -27782, 26260, 14368, -21228, 5938, 11886, 23123, -6074, -7210, 27297, - 5344, 32557, 30510, 17851, 20901, 20674, 6650, 15133, 17804, 7048, -27731, -9749, 23950, 30510, - -10986, -15607, -26118, -2201, 32266, 8505, 30857, -19438, 8568, 12875, -9603, 31902, 1648, 5331, - 30957, 3819, -6994, -10903, 32186, 11589, -16805, -22752, 16951, 5314, -28483, 15195, 10634, 14663, - 3508, -30299, 4645, 577, -27176, -20830, 12678, 5017, -24886, -29450, -12567, -17177, -24580, -18031, - 9632, 26004, -9760, -25565, 6459, -32247, -32616, 7369, 3412, -795, 19073, -24657, 15873, 21228, - -27819, -3063, -15246, 21283, -17124, 4954, -25134, -8604, -11478, -6596, -27627, 31531, 21502, -20268, - 27118, -25574, 5174, -27815, 3310, -24240, -418, -15855, -30941, -4278, 15783, 32085, 22874, 31809, - -11855, -21552, 23253, -25036, 5363, -7924, -13669, 8661, -6175, 7222, 24583, -15285, 4727, 29035, - 21642, 15450, 9549, 18950, 1681, 15005, 26298, -26796, 27267, 16037, 12497, -26817, 28829, 1668, - 12898, -25045, -22041, 16092, -32538, -11624, -25514, -1264, 31707, -12840, 5312, -28997, -23044, -6348, - -16819, 5072, 26707, 2639, -10379, -10672, 25605, -2002, -2505, -2274, -31918, 13967, 967, -27297, - -10232, 16149, 24245, 18968, 9393, 21183, -27039, 7586, -18861, 24457, -27883, 5821, -27420, -16832, - 24588, -14889, 19563, 29705, -12852, 17875, 8273, 8632, -472, 9911, -4289, -24830, 19315, -14183, - 9683, -21103, 6939, 6116, 25272, 23510, -9387, -13386, 24521, 30965, -1658, 24997, 13023, 22753, - -76, -1986, 11964, -1216, -22166, 16578, -6645, -14416, -5539, 7723, 25289, 14013, -1561, 27915, - 13128, 27532, 3999, -26365, -30505, -20354, 2587, -10899, 17574, 24390, 28725, -9520, -2747, 23162, - 26717, -20865, -4174, 21676, 27581, -8543, 13953, -15080, -14709, 22542, -24288, 17476, 16739, 4645, - -5017, 12593, -12267, 12354, 32017, -3561, -13810, -11946, -238, 13605, 30758, -30780, 22553, -14017, - 17449, -3287, -23060, 21358, 9898, 20974, -7829, -10695, -7420, 30536, 22736, 19905, -16089, 27819, - -17447, -6239, 26850, 2965, 13115, 8453, 20014, -7311, 31528, -32536, 15157, -3582, -12820, 19438, - -12007, -15942, -11092, 19347, -29947, -16876, -8854, -32034, -21371, 22712, -13327, -32390, 18827, -28876, - 19129, -27323, -8415, -27338, 21763, -27576, 21753, -16642, 23864, -1444, -19719, -26802, 8959, 29856, - -16026, -20985, 19811, -13525, 27461, -22502, 17490, -20897, -508, 13008, 8910, 547, 5932, -2688, - -32220, 27670, -10902, -20325, 1264, -3779, -12386, 17317, 31816, -24151, 32044, 461, -29052, -10725, - 24499, -20561, 11253, -23606, -15655, 8614, 26649, 11061, -18443, -14419, -19770, -19075, 2394, 10108, - -21578, 8357, -13122, -14412, -11201, 2219, -10158, -6710, -19355, 7147, -28648, -22968, -20875, -28198, - -23992, -21846, 20116, 15140, -12605, 5067, -24656, 22973, -30242, -20628, 18536, -1458, -7319, -8721, - -30064, 28504, -15417, -29238, -28693, -9370, -4517, -10626, -7300, -1883, -22212, 29478, -28711, -30577, - -12926, -29051, -13730, -8501, -5846, -22798, 28999, 28849, 26058, -18514, -24234, 11257, -7741, 23963, - -11511, 25417, -31763, -2912, -9486, 16898, -11690, 23997, -26881, 7507, -23850, -17887, -23258, -25513, - 11266, -492, -25981, -20016, 29352, 30958, -7722, 8158, -10149, -22329, -31320, -9969, -4354, 3882, - 11078, -7600, 14509, 20933, -18561, 16079, 17312, -11451, 19829, -10119, 23300, -26763, 24084, -11657, - -20269, -13516, 23477, -23672, -5753, -9354, 13549, 2270, -24909, -5975, -23378, -10208, -15434, -15809, - 18333, -30159, -27742, 28813, -14687, -17408, -16812, 1802, -23076, 1476, 11214, 23206, 7779, 300, - -28626, -28330, 32587, 8336, -21932, -21391, 26364, 8011, -15708, -5316, 21216, 21327, -15998, 12569, - 10136, 862, 24938, -23137, -3882, -3275, 7212, 22195, 3002, -6069, 8052, -10181, 10545, -12889, - -17550, 24704, 22514, 5853, 11516, 13060, -4230, -10484, 3526, 4452, 22317, -24406, -14867, 28879, - 30188, 31355, 23895, -17210, -13114, -17257, -22921, -26105, -17943, -29342, -10457, -4279, 12929, -11361, - 23437, -27774, 14130, 5078, 20531, -28522, 8999, -13816, 21143, 25719, 20263, -5552, 23842, 19076, - -925, 29511, 28438, -25299, -18753, -21285, 27571, -12887, 22606, 4486, 16542, 23360, 18839, -28380, - -10678, 11390, 17048, 28232, -1560, -18492, -30740, -12395, -19876, 5322, 24046, 9784, -41, 23510, - -16176, 3264, -11672, -21085, -3230, -16595, -7205, 6205, 13348, -3428, 10320, 22993, -14733, -12268, - -6034, -19202, -2850, -8470, 12146, -22828, -9572, 4219, 30653, -14635, -4153, 10300, -31015, 22207, - -12203, -27461, -27260, -17011, 31828, 13527, -5464, -21192, 6482, 2543, -12348, -17957, -2357, -5745, - -27924, -26376, 16814, 28502, -19048, 21675, 21377, 8995, 20760, -6417, -18918, 8233, 25605, -25000, - 25439, -10689, 17287, -2920, -1440, 14175, 29192, -25307, -24814, -22760, -3592, -22036, -5138, 14617, - 12194, 9604, 19666, 4038, -23432, 16052, 17356, 6350, 6782, -796, 28527, 13655, -26003, 556, - -25949, -945, -14175, 19422, -7207, -23561, -23560, -3333, -16804, 2323, 19673, -4791, -241, -18671, - -8644, 5525, 21164, -10354, 16536, -29964, -2264, 26819, -12274, -3373, -32002, 28536, -22065, 1623, - -17472, 24944, 23831, -19719, 10410, -28221, -8693, 17476, 18533, -6954, -13618, -22782, 9371, 15454, - 23888, -29250, -25411, 10219, -26674, 26312, 18566, 22423, 8830, 29434, -23473, -22954, 13110, 19251, - 16764, 5455, 4579, 24728, 23601, 5908, -4051, -27590, -12051, -14228, -7896, 5318, -10912, 1759, - 20089, -2377, -3353, -27526, -15585, 1141, 23632, 10594, -13053, 8450, -17386, 21393, 2807, 10568, - -30843, -12814, 25908, -8688, -13352, 23499, 20299, -3364, 7150, -9485, -31874, 24924, 6632, -420, - 7762, -15401, 27611, -26224, 26992, 30054, 25507, 26597, 22945, -17062, 12240, -4011, -9595, 6821, - 6912, 29272, -21065, 22815, 20841, 22748, -27427, -1468, -22264, 460, 12958, -18986, -24140, 21267, - -29844, 25563, 26917, 28079, -17510, 21481, -3461, -26889, 27122, -5930, 12438, -19247, -29638, 26904, - -19144, -30954, -8734, 12670, -14921, -29956, 5104, -18397, -12751, -13163, 28460, 314, -27095, -4553, - 12427, 3265, -30100, 27344, 5517, 32622, -4864, -5596, -13445, 5510, -6051, 18187, 17845, -31034, - -7428, -430, -27870, 8595, 18982, -8689, 21389, 14887, -31322, -17275, -13383, -5195, -4772, 3426, - 17549, -7663, -11627, 4314, -9870, -17133, 24670, 3317, 22815, -32529, 17547, -10224, 5154, 16527, - 25047, 22379, 27228, 10728, 21223, 22892, 26702, -267, 22962, 21892, -25219, -4995, 11465, -2191, - -5140, -8275, 184, 20306, -12427, -31827, 25191, -32164, -29182, -29592, 24119, 9473, 16087, -26228, - 3580, -30101, 27747, 13447, 5069, 6848, 2917, 8618, -5031, 5553, -16566, 14093, -17053, 9279, - 4653, -25633, 28808, 26996, 15853, 29958, 8307, -5770, -363, -4972, -27876, 28527, -13561, -486, - -26026, -23148, 22970, 5891, -14574, 1922, 31967, -8287, 24551, 7524, 20276, -29213, -29567, 9487, - 16337, 29531, 19215, 31854, -4297, 9607, -4261, 30926, -13879, -23548, 2558, -13049, 21716, 21596, - -24517, -3760, -25622, 18737, -3804, -16805, 8945, 8988, 11314, -25323, -7351, -11728, 26568, -28215, - -7842, 5294, -26201, -18705, 14338, -22763, 23556, -26970, 31052, -29156, 30912, 5309, -11247, 979, - 26622, -505, -7483, -9807, -15669, -21665, -15552, 30797, 24299, 18469, -32142, 23013, -8405, -28595, - 27210, 21756, -21344, 26438, -10703, 9975, -16699, -9222, 567, -27817, -20977, 14741, -11772, -24103, - 10642, 10477, -22059, -31038, -17881, 1181, 26272, -32337, 31573, -21785, -14173, 17785, -6278, 4103, - 12927, 20336, 31297, 16416, -10122, -18080, -2930, 13482, 25341, 5074, -18335, 30829, -9486, 2620, - -9720, -15867, 4649, -3414, -9610, -7292, 13770, -10798, 6847, 25692, 26266, 12692, -30909, 15485, - -1964, -28132, -15190, 28747, -18842, -29337, 23073, 14368, 32321, -20420, -7336, 19412, 10011, 9370, - 22889, 6955, 4047, -10010, -32203, -6365, -17130, -20242, 30087, -25979, -32027, -11406, 6752, -6382, - -32258, 11333, -22672, -12678, 10560, 5853, -28154, 31744, 28170, -15427, -8707, 637, -32381, -21929, - 17502, -25643, 25799, -19476, 26247, -10977, 4996, -17469, 26601, 20840, -20948, 13548, -895, -21011, - 24649, -11582, -20850, -30076, -2089, 8315, -23234, 26860, -10336, -2513, -7242, 196, 5574, 24416, - -14092, 6827, -14130, -16725, 1991, -12766, -9315, -9128, 26817, 6901, -16403, -7448, -12803, -32630, - -10523, 30435, -269, 29292, -27194, -4388, 337, 26452, -9515, 7744, 17642, 1499, -10079, -17602, - 28536, -25368, 13387, -13678, 24417, -26354, -18430, 25433, -18695, 11647, -29748, 9549, 18015, 22175, - 30182, -2574, 703, -17727, 26893, 18276, -7417, -6167, -19238, 27741, -31204, 31546, -4272, -27475, - -8503, -26288, -18512, 13526, -6613, -2672, 11671, -29013, 7765, 3132, 19580, -8262, 5060, 8438, - -2552, -20314, -3659, 19435, -20772, 10896, -5289, 13265, -20860, 14856, 27741, -14255, -23341, -28312, - 1409, -16714, 482, -3937, -4829, 29065, 15540, 8121, 28097, -5733, 14836, 7832, 5958, 7472, - -28670, -30825, 2367, 6256, -15910, 19713, -23295, -5692, -3432, 16951, 32232, -25709, -9114, -13606, - -6459, 29985, 8767, 21342, 16874, -12740, -2294, 5926, 11501, 20289, -4145, 4456, 15917, -15024, - 10226, 25696, 6411, 14985, 25482, -22186, -23648, -19569, -14163, -26616, -1434, 886, 7660, -17194, - -18708, -6468, -27694, 3806, 24946, -14022, -26380, 30273, 25445, -21723, -25580, 21996, -14491, -9483, - -23132, -7669, 9295, -9864, 5655, -32671, 23531, 30817, 18004, 18533, 13076, 17708, -14628, 8045, - -12016, 1530, 423, 20401, 16189, -26904, 2101, 3955, 13129, 4259, 18698, 5683, -23372, 19208, - -11620, 7490, -24066, -20945, 21017, -17979, 31773, -15693, 714, 14257, 7349, -28773, -5380, -10765, - 18170, -19411, 21128, 17812, -1754, 4231, -12543, -1330, -8296, -32532, 20171, 1622, 27650, 11635, - -28391, -8970, -31912, -21457, -7011, -17654, -14243, -8899, 21761, 30257, 2724, -30727, 7919, 1448, - -27820, 11647, 16012, 21255, 31228, -22097, 20785, -27138, -18080, 30872, 26038, -4553, -22451, -29104, - -4587, -12789, -9845, -31731, -17224, -12541, 10881, -10283, -29923, -3115, 3675, 13541, -22569, 438, - 28944, -3709, 26365, -2915, -25921, 21656, 23325, -23298, 28783, -5006, 24467, 17248, 14857, -3383, - 3903, 7657, -2358, -30356, -997, 14453, 2630, -2918, -7381, 29335, -20302, 31300, -25814, 29159, - 6581, -17871, 7748, 879, 32172, -24457, 6790, 9992, -10865, -26528, 2568, 28796, 26781, -3008, - 25248, 5152, 23858, -28414, 24129, 2996, -25973, 16857, -8155, -1897, -12732, 21803, 27297, 8655, - 4818, 6497, -1570, -9496, 22505, 28871, -18350, 20816, 12692, -6112, -1615, 22533, -16213, 909, - -30428, -30214, -9308, 13176, 17477, 4052, -2749, -26397, -11819, -5584, -22295, -4331, -25754, 16761, - 10742, -11405, -7923, 15673, 28679, 22334, -21892, -15041, 18704, 19955, -18588, 15479, -29746, 6418, - -21389, 31352, -8637, 19985, -15554, 25418, 13422, 5674, 17025, -7672, 13035, 5913, -27411, 15631, - 2061, -25755, -26139, 31076, 7432, -27960, 30071, -15433, 24686, 17545, 28731, -26699, -289, 12908, - -5953, -2610, 17097, 11598, 29199, -12979, 26848, 5216, -3104, -30124, 29489, 4761, -26738, -21244, - -15488, 10776, 14008, -14271, 14224, 18286, -6177, -3628, 19238, -26377, -11520, -15229, 13851, 14507, - 20960, -4758, 11366, 28653, 22821, 2277, -27964, -10993, -321, 10825, -19816, -13483, -28679, -11978, - 18663, -8610, -4141, 19401, -11178, 29235, 8347, -24447, 32171, -19472, -25871, -11098, 10089, -24312, - 18818, -5725, -2421, 17638, 18069, -26571, -15909, -16933, -2625, 15718, -28769, -4669, -28313, -22568, - 22280, -20474, 23790, -1458, -12101, 2161, -20917, 20277, -20463, 16994, -26643, -3445, 9068, -23414, - 517, 30224, -9684, -20791, 5826, -12056, -18574, -23441, -32439, 8951, 11545, -20707, 8953, 622, - -3052, -10048, 656, 15715, 17882, 31543, 25666, 20440, 22471, 28718, 9904, -10034, -14332, -14493, - -8595, -30744, -10333, -32199, 29144, 30204, -23526, -792, 10020, -28756, 1656, -13585, -12615, -18228, - 9183, 7164, 11476, 18957, 26215, -3510, 6411, -16259, 17897, -4640, 14886, 22210, -2349, -2188, - 2141, -30487, -18832, 8435, -20084, 20507, 21028, -13765, 4769, 27422, -6638, 8945, 26146, -3396, - -633, -23534, -1236, -15916, -14742, -31598, -13731, -16224, 30896, 21168, 35, -19664, 14726, -9337, - 2378, 15903, -31181, -16275, 13065, -27590, -17390, 10810, -22468, -817, 15085, 27028, 27518, 29713, - 9214, -29513, -21313, 14977, 31031, -32310, 27428, -28377, -18605, -14637, -31044, -6496, -31693, 19486, - 2782, -26130, 32553, -15474, -4645, -14900, -27839, 7164, -32265, 4934, -14998, -13423, -5855, 32436, - 22417, -15953, -17347, -2645, 29639, 32431, 5304, 24065, 6975, 21488, 24712, -23967, 6778, 14066, - 22954, -9584, -6591, 2011, -7402, 16807, 12815, 7139, 12580, 4612, 9389, 25679, 24738, 21506, - -11933, 31077, 25586, -19869, -32668, 15993, 29302, -11456, -21743, 6829, 30524, -6149, -23015, -24482, - -23768, 29057, -6029, 1873, -17830, 8729, 21027, -13723, 30615, -25299, 14554, -32309, 9689, 573, - 27455, -17776, -10629, 2158, -5788, -29827, 29760, 31307, -6505, -20584, -24521, 22829, 32706, 32227, - -31491, 20635, 28630, 28700, -26983, 14717, 30525, 12013, 13392, -22167, 6768, 1972, -25187, 1429, - -17243, -15432, -4241, 30378, -2606, 14877, -8903, -28480, 26954, -26675, 20127, -13992, 24940, -7643, - 14436, 17117, 16648, -6646, -5641, 16473, -8844, 2901, 17001, 15074, -6242, 16787, -15361, 20062, - -5498, -20767, 21209, 11839, -25196, 17065, 22468, 28417, -21995, -31221, 28815, 12395, 7510, -3203, - 25694, 12923, -31232, 4005, -28228, -32663, 21253, -8130, 9042, 12799, -18784, -4686, 22827, 19229, - -30906, 19722, 7664, 25570, -25144, 16695, -31559, 28169, -5934, 14044, -27638, -8773, -7735, -8153, - 29497, -23562, 23668, -19237, -23555, 9845, -16724, 14936, 10027, -22330, -13917, 15115, 30594, -17464, - 1642, -27105, -23198, -1400, -30744, -20765, 22871, -8650, -1080, 28718, 17026, 17631, 14672, -2398, - -11129, -26615, -26254, 6535, 31933, -20883, 6794, 11383, -10788, 15428, -23894, -27513, 4987, 444, - -14183, -8214, -5438, -23433, 3677, -20558, 19519, 10474, -11417, -15608, -4468, -21779, -8825, -29710, - 13043, -24632, 6881, 482, -8545, 20157, -9957, -15308, 30397, 26801, 8816, 1480, 19544, -31252, - 17207, 23861, 28400, 17273, 22861, 12290, -5876, -28688, 13799, 17710, -18481, 11313, 32050, -2546, - 14980, 27422, 1819, -23593, 6252, 32263, -12871, -6255, 28107, 16861, -16028, 21862, -23197, -23529, - 8458, -5722, -2228, -20401, 788, 8073, -24383, -2642, 6826, -25010, 520, 12246, -3654, 274, - -29017, -20509, -23180, -13578, -17064, -21253, -24801, -2920, 17748, -2803, -2190, 2268, 28921, 9970, - -16108, -21337, 6404, -23223, -18171, 10511, 384, -18471, -3757, 2437, -598, -15368, -1080, 19211, - 22052, -13322, -29079, -13731, 5758, 22395, -8979, -26310, -10006, -18149, -20506, -6277, -20225, -23457, - -14301, 30797, -8446, -1480, 27121, -23846, 5707, 9383, 3613, 29948, 23679, -20481, -26239, 26554, - -24989, -1503, -22354, 32348, -22810, 23473, -24157, -28349, -11826, -16414, -8296, 2338, -2652, -9374, - -18033, -19772, 4904, 2356, -12941, 15649, -19132, 18664, -20707, 4582, -22070, 9013, -261, 26073, - -1412, 24297, -16818, 14011, 7230, -15700, -3966, -10587, 13384, -16820, -15456, -17437, -30422, -9495, - 22404, 11040, 18292, 14323, 17162, -30544, 31060, -2094, 4475, 24343, -12309, 20574, -4639, 32482, - -25641, -6591, -20954, -4868, 17441, 9631, 1802, -31039, 7849, -30348, 16957, -18962, -4398, 31018, - 13, -28487, -6756, -32321, 14513, 18322, -11711, 26587, 27272, 2857, -22749, 25475, -30148, 29883, - -25557, 3503, -2240, -7872, 24571, -10325, 27982, -13656, -4880, -16392, -3476, -22008, 15968, 31855, - -13221, 29277, -20254, -10864, -23291, -22770, 31702, -4617, -31873, -30946, -30124, -14468, -31143, -14572, - -11840, 22728, -11574, 19743, -27327, 16632, -7922, 25393, -12370, 9472, 6607, 16550, 20600, -1081, - 10858, -16543, 21803, 22714, -5658, -25841, -5397, -19182, 20953, -24364, 17100, -8422, 19113, 6060, - 29427, 26014, -12842, -20735, 30466, 11115, -20219, 17769, -4577, 15491, 14033, 28509, -24744, 31204, - -27715, -16490, 14060, 16967, -5780, 15247, -10368, 814, -2433, -12874, -9482, -27165, 30636, 5098, - 31919, -12492, 6034, 19173, -2493, -32655, 6262, -18301, -9170, -32251, -21925, -16028, 23264, -6233, - -26657, 32416, 1317, 24366, -27217, 3410, -3360, -18888, 3200, 9529, 4227, 18351, -24993, 9753, - -1260, -25834, -3635, -21745, -16829, -446, 3904, 20322, 19307, -25256, 9171, 26303, 23363, -4282, - -2572, 28760, 29279, 16779, -18243, -27146, -9490, 28042, 27237, -6424, 31919, 29988, -17205, 23809, - 23718, -21252, 24115, -17168, -8743, 22995, 1349, 6339, 221, -4879, -28939, -32334, -25821, -28669, - 5572, 12576, 23973, 8409, 7755, -18946, -1484, 14564, 10875, -10024, -8569, 6224, 14003, 5306, - -3078, 22227, -31854, 226, 4901, 13817, 18155, 19416, -4814, -27995, -31029, -25189, 3130, -22620, - 32686, -13600, 32127, -21047, 15077, -13084, -8558, -31911, -28612, -22672, -16143, 1589, 22372, -23219, - 14275, -32603, -6534, -1834, -10816, -731, -2417, 20389, -27960, -5986, -10069, 15013, -8440, 26022, - -21028, 2854, 13489, -9024, 17418, -24337, -11786, -32482, 18453, -24385, 12884, -27101, -7747, 14290, - 7136, 18881, -9256, -21105, -18365, -1046, -2478, -9737, -20414, -5544, 28011, -11273, 28292, -19038, - 5003, -16661, 2854, -29215, 18163, -9520, -16008, -10917, 30166, 21519, -18811, 18320, -32166, -32692, - 27160, 9625, 1819, 27116, 16493, -13412, 22068, 21881, 25220, 18393, 2659, 26962, -7803, 23490, - -31212, -5546, -11682, -25082, 32535, -43, -28360, -897, 4007, 9249, 9731, 718, 7451, 27302, - -12901, -23889, 16053, -28736, -22901, 10684, 29157, 17285, 17954, -25113, 23962, 8797, 27613, 21724, - 28293, 10415, 946, 30768, 4986, 17227, 1828, -7716, -32251, -8337, -10119, -10314, 19741, -2706, - 12255, 32449, -1455, 25135, -7652, 21366, -16377, 17440, -22271, 3689, 31505, -19662, 2320, -130, - 25291, -7184, -25362, 24566, 8557, -8386, 21086, -3041, 1023, -18163, 6991, 17982, 32075, 25861, - -15866, 31684, 13512, 26347, 32509, -29863, -10487, 10753, 5571, 24839, 10407, 25891, -18985, -18607, - -14304, -3401, -19679, 28293, -27750, -19956, 2483, -6845, -30276, 4383, 13002, 2623, 25391, -13369, - -15195, 10842, -27011, 7632, 32373, -14988, -2198, -23302, 31198, 27974, 912, -17969, -30677, -22236, - 29156, 6021, -23888, -24283, 6449, 23256, 4974, -28413, 9361, -30515, -10275, -24994, -17995, 22713, - -4697, -20915, 11875, 18790, -9033, 22504, -2471, -21892, -18852, 6053, 5054, -28414, -23075, -536, - -25970, 14540, 29407, 1466, 18155, -2103, 250, 5909, 23468, 18686, -30964, -9876, -600, 6187, - -17247, 23155, 24962, -22403, -27596, 14010, 2828, 19982, -15456, 7413, -23202, -26094, 29559, -16164, - -23520, 935, -9087, -7955, -13774, 605, 1727, -22347, -7025, 25267, -23424, -22423, 26078, -9978, - 32600, 25195, 9923, 24354, -10853, -16362, -19961, -31446, 28922, 11806, 17251, 10371, -10562, 29366, - 10134, 22778, 3227, -12680, -19223, 896, 2133, -7975, 2081, 25035, -8514, 7629, 6229, -21796, - -16196, 4912, 27665, -4252, -24021, 2234, -4933, -28183, -16724, -2273, -9914, 19497, -9021, -10381, - 32465, -28431, -7844, -21468, 17185, 27196, 20240, 17337, 5618, -12308, 7997, -5519, 1021, -17943, - 15508, -28629, 7327, -3773, 8061, -5749, -27087, 108, 26343, -4172, -11025, -23144, -21635, -6900, - -14373, 27427, -1318, -24309, -2718, 16444, 16504, 28987, 19254, 32728, 31522, -25149, -30653, -2438, - -10141, 27283, -31431, -10939, 17099, 1662, -8257, 31102, 18993, 23469, 23407, -1108, -10377, -8067, - 13836, -7039, -11742, 15085, 15783, -29617, -27807, -11309, -31362, -67, -6420, -3775, -12567, -2275, - -27305, 15598, -6082, 620, 31949, -17980, 17255, 22017, -29370, -28191, 17763, -16993, -10462, -31786, - 4760, -13714, -24964, 8375, 9246, 29773, -14797, 25116, -7818, -23993, 9425, -15235, -23987, 12533, - -24066, 31124, -8409, 16434, -18951, -17110, 14275, 22444, 21187, -7967, 9777, 14616, 28411, -24423, - 29264, -6948, 1715, 13179, 83, 14240, -14174, -7810, -31517, 23669, -3512, 1344, -489, -31098, - -30355, -4084, 4580, 25447, 23254, 31642, 8367, -18338, 9520, -29382, -24714, -22342, -14789, -23226, - 23093, -13099, -14653, -16328, -8408, 4391, 14280, -18710, 14010, 18276, 5399, -18243, -29575, -29864, - 18262, -9441, -10130, -629, 2729, -22042, 10152, -11578, -3235, 13769, 29134, 28349, 2962, -2158, - -13007, 21847, 31039, -19617, 26492, -29742, 30245, -31737, -30307, -27512, -23424, -13634, 27781, 13231, - -4106, 4510, 12770, -17322, 12162, -22404, -23666, -19111, 9195, 25852, 25886, 16970, -13336, -1445, - -28504, -98, 12307, -22248, -10389, -13528, 8998, -10966, -31822, 7838, -7023, 23578, -25317, 5173, - -28451, 29322, -28278, -31532, 25834, -6957, -29473, -7182, -24959, 14944, 25581, 1441, 18278, -13612, - -20050, -28512, 21845, 22209, 3680, -5869, 13600, -4661, 4490, 16266, -27758, 6626, 8993, -2139, - 30347, 6082, 2825, -5404, 8027, -14427, 15033, 25809, -19632, 29871, 19696, -19309, 4466, 6687, - -18154, -19264, -736, 13504, 26885, -25216, -3767, 17565, -21099, -14480, -31328, -18486, -18874, 31333, - -31094, -14938, 18297, -17380, -25046, -19378, 6556, 142, -15114, -3525, 13778, 4447, -17063, -6365, - -27215, -231, 28585, 16118, 5482, -31412, -18866, 16377, 20940, 7025, -2099, -30988, -31297, -13276, - 8326, -16188, 4695, -10649, 13043, -7477, 13738, -19759, -6213, -11951, 28684, -8595, -9275, 32544, - -21793, -14741, -5627, 21, 4609, 16548, -23959, 15435, -10009, -12094, 31387, 25339, -18959, 20307, - -11936, 25340, 10630, 1880, 19343, 23059, -30689, 29643, 13038, -26017, -18983, 29370, 17245, 1462, - -6340, 27644, 13035, -32249, -9184, 29182, 31890, -8700, 9511, 15805, 15847, -9918, -14155, -11093, - 3193, 26374, -28850, 5577, -13817, -17135, -9912, -30684, -26727, -31449, 12783, 10854, -19444, -10366, - -21426, 23776, 26548, -450, -3543, 16332, -16980, -7947, -32294, -4861, -28195, 4245, 21810, 5116, - 8701, -26782, 27664, 26112, 4262, -10016, -21721, 32623, -7034, -12004, 2953, -17750, 17262, 32695, - -19518, 20430, 31504, 3799, -8949, -95, 21598, 32653, -29395, -8765, 23440, 18958, 9837, 15504, - 25572, -31529, -11075, -6875, 17308, 20088, 26348, 14976, 4934, 25251, 25466, 20680, -18927, -28763, - -1514, 19551, 11121, -17851, -16742, -4475, 31443, -10467, -30096, -10274, -19285, -8167, 20048, 21685, - -22231, 24417, 18859, -8568, -19794, 380, 27320, 18996, -669, -14584, -8356, -15328, 12925, 31110, - -4718, 10713, -28816, 8035, 27865, 17659, -15109, 9853, 27228, 8492, -13584, 20963, -6146, 22864, - -10185, -22328, -12376, -21405, 21715, -1355, -24657, 1397, -14605, 19117, 19143, -13343, -2625, 28878, - 5927, -27477, 217, -29656, -25301, 28531, 29243, -12335, 23735, -8117, -15766, -28898, 30949, 22278, - 8169, 750, 27595, -10398, 6039, -26354, -9640, 10822, -31423, 10105, -28374, 6663, -26455, 16218, - -21599, -30484, -22801, 4724, -19484, -24103, 1584, -6989, -1243, 19097, -5489, -8930, 13084, -31385, - -28711, 10401, 20965, 26016, 12887, 26949, -2693, 23487, -4788, 21477, -10217, 22422, -30312, 23443, - -3314, 6784, -819, -2411, -6225, -18661, -5195, 11045, -18995, -4700, 27044, 9938, -2550, -30144, - -5489, 12662, 15892, 19546, -14749, -9395, 3999, -7051, 15738, -20925, 20136, -13709, -20167, -4378, - -22093, 12960, 19289, -10400, -29121, 28847, -17710, -350, -21455, -21581, 2505, -6766, 26960, -25934, - -1501, 24944, 8160, 3452, -9297, -22678, 2097, -23411, 2478, 10688, 9668, 24284, -3144, 4289, - 1643, -14355, -10371, -16019, 2467, 31881, 27977, 2393, 9590, -5603, 20408, 5075, -21115, -16878, - -32145, -24003, 18660, -16033, 32014, -27799, -24381, -15501, -1775, -4920, 27770, 26283, 20313, -27944, - 9929, 26723, -14835, -8702, 9283, 20222, -31833, -23749, 24983, 19245, -8495, -8445, -5138, 13024, - -31630, 10375, -23277, -23942, -6094, -5251, -11665, 842, 30143, -25986, -22075, 15379, 4875, -30350, - -9760, -26025, -24707, -28386, -5486, -30710, -28736, 4905, 17186, 4343, -15534, -13320, 251, -28056, - -12072, -4496, 12629, -25580, 19454, -9867, 13264, 3956, -26835, 5426, -25789, -15392, -32071, 10457, - 30793, 26580, 26560, -15205, 1761, -31584, -8199, -11936, 3466, -20054, -12441, -8760, 23924, -2714, - 24574, 23789, -12917, 12978, 18883, -19746, 2943, -32406, 23948, -17009, -26530, 8058, 6367, -4610, - 122, -7457, 30231, -21282, 22516, 32284, -20994, -18745, -6232, 24019, -6053, 11613, -31523, 26686, - 30177, 8277, 7693, 9284, 24702, -28620, 20781, -27911, 13338, 3296, 20315, 26648, 23134, 981, - 21297, 32598, 10637, 3811, -5112, 27386, 23285, 19936, -298, -26659, 5815, 12379, -23490, 183, - -6216, -21968, 23704, 15320, -5520, -23631, -24337, -23416, -28316, -31915, -17075, -13521, 15395, -25866, - -2163, 15261, -22227, 11064, -9236, -7433, -295, -6290, -28402, -20058, -18126, -31113, -19476, 5444, - -12520, 5655, 10018, 100, -12594, -15553, -26639, 10932, 18321, -12109, -30953, 24569, 29369, -4415, - 5508, 24576, -13001, -16338, 7641, 32213, 16560, 25320, -4376, 8221, 23696, 18724, -10225, 1969, - 15562, 27570, 14816, 17629, 23737, -15963, -25477, 2536, 22579, 14036, -19116, 7744, 29604, 9754, - -8130, -19930, 28221, 16149, -14649, 24967, 25614, 6072, 16219, 13555, -15248, 10740, -8380, 12811, - -6809, -12054, 19927, -12505, -24000, 5217, 19647, -2868, 16765, -31177, 30642, 20762, 29277, 7239, - 18151, -23925, -12990, 7872, -4387, 4261, 17592, 17042, -12097, 15066, 6248, -22142, -4898, 8555, - 24381, 21961, -16088, -32391, -19509, -32597, 30621, 29848, 22791, 1191, 9932, 25468, 28614, 23276, - 11977, 29885, 22560, -23823, 1107, -9391, -2143, 4068, 28849, -29729, -16275, 13438, 13165, -23112, - 4197, 20086, 9028, -29918, 20115, -3537, -7247, 4243, -25963, -14125, -17756, 4030, -20109, 4414, - 26122, -23093, 22312, -12161, -23648, 25862, 16371, 5410, 886, 24204, -10191, -24572, 1498, -20987, - -5679, 26357, -30798, -18646, 9688, -29758, 23358, -26371, -24446, -12454, -8074, 24656, -8830, 13073, - -29481, -19344, -29277, 30683, -18725, -14883, -21984, 6041, 14658, 10657, 28156, 9989, 16501, 25674, - -23550, 2197, -28326, 4986, -11275, 26828, 28908, 22109, -2078, -9294, 22117, -4501, -12940, -7380, - 19793, 3880, -31370, -2733, 27362, 20427, -6355, -1367, -28474, -29436, 4172, 30887, 18138, 32585, - 12583, 17750, 436, 29829, 9990, -19704, -28142, -24569, -9558, 8037, -16849, 12437, 20286, -7613, - -17253, -21871, 22527, -29548, 30149, 9160, -13631, 26489, -10651, -23768, 3393, -7911, 18335, -19603, - -7845, -10085, -8021, -15614, 28236, -26257, 16063, 8509, -17926, -19312, -28497, 10943, -28270, -18723, - 17183, -13244, 25634, -26155, 29788, -22142, -16386, 6164, 22283, -25399, 32068, -9044, -5321, 2809, - -15614, 4065, 19114, 31821, 2493, -207, 2717, 13297, -27006, -12161, 1908, -17586, -19871, 30572, - 12851, 962, -31549, -9530, 31525, -26828, 23369, -29453, 8803, 3671, -23755, -31641, 10656, 17970, - -4132, -13675, 25009, 9850, 11340, 30441, -22755, -8917, -5951, -5685, -20861, -11633, -13689, 26842, - 7245, -21894, -21564, -31159, 15195, 10161, 21724, -32604, -28203, 171, 24059, 9591, -23791, 27347, - 27272, 20283, -21579, -9669, 27154, -9360, 13480, -7113, -5533, 24261, -13529, 12524, -5203, 15596, - 9369, -3531, 24917, -13719, 24889, -17357, -23605, -3401, 27438, -19237, -1262, -9385, -24582, 29873, - 17471, 15870, -20342, -29128, 19991, -3186, 28558, 14696, 6653, -16147, -15316, -7731, -29621, -19979, - -16673, 8066, -12817, 7546, -6767, 24156, 4949, 12554, 20544, 27419, 13112, -4213, -29423, -22752, - 4401, -22820, -1672, 32710, 31922, 21843, -27352, -14995, -19588, -23001, -14040, 26121, 16201, 15066, - -27828, 10675, -16325, 16110, 21004, -25266, 7539, 28693, 16069, 15846, 20055, -31501, -28451, -7012, - 30222, -19814, -3639, 6430, -2336, -20877, 5932, -17164, -12656, 18931, -8834, -162, -3258, -12323, - -29957, 2223, -17472, 11234, 9531, -5131, -18731, 5471, -14536, 24858, 29050, -1355, -28168, 14183, - 16689, -11191, 31393, 6354, -12215, 16913, 3077, 16480, 27833, -32738, 12542, -8594, -23693, -29744, - 13683, -28527, -7575, -8446, -18652, 23904, 2387, -30180, -14002, 22586, -7782, 31464, 27877, 30479, - -23029, -42, -30436, 14226, 11103, -16921, 16271, 9512, -29041, -17201, 6327, -691, -21493, 26726, - -24734, 10157, -29152, -3409, -3705, 1417, 27897, -2185, -30969, 21869, 17428, -20368, 22756, -31800, - -15104, -31468, 4096, 16348, -8754, 14584, -20043, 13711, -13388, -16387, 28687, 20335, -28173, -19886, - -22695, 5855, 27391, 9011, -24055, 355, 26101, -574, -2608, -16541, -29262, 20949, -15371, 21862, - -22195, -10194, -9589, 10899, 25148, 21989, 4567, -9586, -12190, -26482, 17988, 22815, -15120, -9445, - -18374, -11374, 18057, -26641, -9096, 23559, -20535, 6710, 20556, -18104, 446, 16715, 424, 28046, - -12812, 10904, -18650, -1383, -1372, 5091, 29678, -18521, -28552, 16275, -16487, -18336, 27610, -1043, - 30893, 5185, 20646, 10707, -11250, 3517, -29346, 4316, 4558, -26864, 1187, 29476, -25540, -23143, - -1162, 19469, 23625, -30332, 29776, -4346, -30782, 8298, 19490, 25593, -19419, -14136, -1232, -26708, - -18596, 10406, -12536, -20954, -14192, -15508, -16073, -27868, -1248, -26630, 6300, 31849, 24986, 17172, - -22770, -30827, -30406, 30071, -4816, -1412, -16439, 29413, -6035, 31285, 15779, 23098, -22324, 30997, - 15313, -109, -18824, 30703, 10253, 19382, -6099, -27088, -24454, 14635, 9127, 8352, -358, 28538, - -17092, -27465, 7279, -10976, 25549, 3144, 20294, 8003, 27447, 755, -28478, 3607, 28229, -1448, - 262, 32082, -470, 1360, -21093, 13781, 29784, -12044, -16576, -31131, -8714, -13293, -18298, 12976, - -185, -22362, -9112, -1455, -12172, 13482, -22710, 9542, 5153, -8899, 13302, 24811, 21071, -10272, - -11184, 16935, -17916, 31104, -3930, 19913, 646, 16524, -29430, 31683, 12646, -12172, -920, -21593, - -31448, 2483, 25984, -18121, 18054, 3375, -17205, -15311, 26626, 17889, -19112, 29995, 2984, -31236, - 13837, -7329, 9832, -21087, -1313, -9370, -9712, 30868, -27854, -10292, 32102, 30032, -24628, 20608, - 5955, -18141, -1841, -10502, -9186, -29068, -15511, 18440, -16162, -16642, -18249, -29440, -2362, -28726, - 11308, 7682, 27070, -26497, 14424, 9227, -202, -4068, 17244, -26622, 21072, 7482, -19146, 27587, - -21874, -23237, -29125, 18370, 24758, -28732, 12443, 13592, 5067, -29856, 10333, -10367, -15684, 18629, - 20875, -5348, -17461, 21448, -19028, -22873, -5516, -13422, -25501, 5542, -4341, -25920, 2726, -15446, - -8963, -31707, -1842, -30608, 20719, -15249, 18605, 27945, -23850, 726, -9223, 10478, 23119, 17111, - 13540, -22979, -12998, -18808, -7928, -739, 243, -31392, 17861, 22887, -26042, 14267, 26125, 3984, - -26113, 18101, 17357, -8093, -5937, -25392, 198, -28630, -18212, -10819, -2317, 12950, -2694, 22574, - -25045, 1078, -5561, 20653, -32142, 30754, -25685, -5836, -12399, 1340, -13537, -20448, -3944, 30708, - -6650, -26716, -30433, 23031, 32744, 18104, 9200, -1651, -19930, -19315, -7922, -15316, 31418, -9132, - -21154, -11853, 4492, -9971, -18641, -22246, -21300, -18654, -14459, 13916, 25866, 16903, 4153, -13891, - -24242, 14127, -2622, 26983, -5708, -9563, 9693, 21323, 15392, -24042, -28233, 15874, -27947, -18766, - -30088, -23698, 17835, 4997, 30825, -8017, 22118, 21945, 29321, 29507, 4715, -3636, 25581, 2861, - -29587, 29313, -28854, -5905, 29259, 29571, -11248, -10201, -20343, -18464, -5607, 5740, 13743, 11952, - 8514, -27634, 20575, -5513, 9155, -20111, 12489, 16315, -1397, -7391, 19952, -8879, -15730, 26687, - 25673, 20804, 229, -22993, 9173, -13225, 18020, 2902, -23238, 3469, 22777, -3770, 3094, 11433, - 4658, 26761, -20056, -17397, -21833, -24823, 28024, -22481, -9128, 8680, -25512, -15982, 28164, 19525, - -3125, 4122, 6232, 12081, 4822, -25607, -18072, -13145, 675, -1371, 13467, 7466, 10470, -13353, - -4724, 19187, 5480, 28845, 28580, -3492, 27115, -12371, 21103, 31985, -13047, -16033, 24604, -28947, - 11707, -12947, -21793, 15999, -14718, 7601, -29989, 13191, 17635, 10684, -2797, 2513, 31429, 27851, - 30772, 14912, 8317, -21798, 19778, -3007, -2198, 29803, -6264, 27414, 19819, -29661, 14713, 858, - 17419, 3811, 32149, -16482, 31639, 26861, -7865, 12524, 468, 23231, -8359, 29698, -15867, -1600, - -19137, 15512, 22887, 6876, -7590, 6890, 15248, 31806, -12755, 11893, -32599, 29258, -124, -19892, - 24129, 24994, 2415, -282, 26163, -4797, -28491, 24784, 18440, -19825, -24926, -6427, -21428, 293, - 13788, -10533, 12555, -28887, -32077, 18962, -17654, -19195, -24954, 29546, -23645, -21720, -2915, 14138, - 32732, -27588, 12116, -27844, -2269, 11893, 4350, -29570, -3117, 9901, 16320, 3369, 8167, 15790, - 16950, 18826, -26443, -18457, 27613, -21475, 21071, -13768, -24264, -11121, 32063, -2043, 818, -7271, - -23875, -19222, -22689, 11887, -26126, 4804, -16404, 18212, -5383, -1857, 3303, -145, -6932, -25440, - 10555, -5567, -6667, 12895, -26417, -26957, -3448, -4768, 11796, -12701, -31982, -10424, 437, -32450, - -22034, -30323, 7921, -31700, 29657, 17416, -18667, -10719, 15126, -12659, -31424, 30791, -16166, 18210, - -17633, -28826, 5876, -16037, 28265, 25969, 5100, -10040, -23793, 24935, 8174, 3938, 26334, -9396, - 9566, -18691, 25583, 889, 32656, 32212, -3004, 29049, -17561, -20444, -9324, -3330, -2718, 18434, - -6111, -24284, 1780, -11606, -11554, 22189, 7304, 32172, -29554, -9831, 8706, 5336, 18878, 264, - -7096, 31308, -18337, -16943, -16643, -251, -11928, 3235, -15013, 29015, 12544, -6762, -1823, 26286, - 23866, -32572, -21595, -23785, 13399, 24210, -1574, 21902, 7168, -28325, -9623, -6326, 22056, 10998, - 8084, 16763, -6453, -27483, -19122, 9467, -3025, 17816, -19227, 32645, -21199, -26210, -16617, -14153, - 7567, -13308, -4139, 6295, 22680, -23023, -1878, 28938, -17469, 3219, 21223, -24332, -19766, 28435, - 13172, -24692, 21479, -15159, 13921, -30156, 5418, -24724, 21712, 32311, -28337, 18417, 27975, 14068, - -3236, 3366, 4053, 23875, 520, 8667, -13258, 6877, -4685, 18378, -17767, -998, -5732, -32601, - -9669, 1662, 14574, 15786, 15043, -7389, 14156, 17513, 2283, -8181, -4789, 16600, -29627, 8289, - 10093, -14196, -26169, 19489, -12312, -30903, 27799, 31509, 5563, 5107, 19756, -31984, -21872, 28487, - 13661, 8018, 20473, 2274, -19114, -10398, -29817, -1308, 22087, 26943, 25017, -13917, -31600, -9295, - -21058, -13183, 11879, -24007, 13536, 26053, 1767, -19638, 22047, -17861, -12959, -4176, -8715, 21887, - -29598, -4725, -3518, -22911, 30543, 24701, -19225, -3973, -23428, 8872, -2723, 17687, -7775, 714, - -9991, 26050, 15255, 16546, 20181, -27153, -11737, -7797, 16254, -30194, 2988, -27436, 25722, 23467, - -26886, 18992, -17246, -11879, -22385, 17053, -32612, 31603, -18581, -21850, 9680, 1676, -29665, 17822, - 17162, 9811, -15201, -14635, -30095, 16541, 24550, -11414, -10070, -12778, -29936, -8142, 14590, 6161, - 10093, -4413, -7173, -14135, -17676, -25373, -15887, 20736, -1921, -15655, 9218, -20434, 29302, -31105, - 30025, -8016, -4394, -8072, 18488, 2243, -23767, -8800, -20314, -24592, 5979, 5424, -2815, -26062, - -21908, 12967, -20858, 2870, -12078, -509, -20352, 9450, 19497, 12617, 17186, 19550, 3338, 10807, - -30223, -22056, 13386, 26254, -11179, -14528, 1717, 19391, 17505, 15368, 11838, 17512, 29958, -4024, - -27871, -9508, -23442, -17177, -25419, -2396, 12923, 9300, -19281, 2509, -6846, -1465, -28893, -11618, - -4885, -10323, -10088, 19496, 26826, 6301, 15931, -5842, 31291, 20850, 14531, 21980, 371, 24269, - 21651, 1642, 8983, 20627, -24548, -21469, -5975, 4622, 12098, 4599, -139, 1775, 19805, -31672, - -6772, 28421, 32731, -25255, 13223, -25198, 13452, 23085, -20697, -1534, 24058, 21194, -26713, 28507, - 7832, -9261, 23527, -32384, 5497, 19721, -5102, 26542, 23708, -31913, 10996, -23756, -31330, 12902, - 3917, -30750, -2358, 17020, -10214, 2147, 10956, 28855, 20131, -28854, -31641, 26800, 14962, 5548, - 21944, 4107, -3421, -18335, 3939, 6635, 7849, -13110, -3063, -26409, -15023, -26532, -30584, -22788, - 2165, 14535, 26588, -16106, -26110, 3128, -4575, 6417, 20305, -26713, -8410, 19263, 27573, 230, - -19634, -10043, 21519, -26867, 6426, 14073, -17679, 1722, -12755, 1220, 11371, -3888, 4397, 9224, - -7211, -29304, 25847, 28017, 22036, 16787, 14316, -16633, 5440, 28217, 25405, 6802, 29841, -22362, - 29151, -2540, 18132, -6653, 22142, -118, 12193, 3492, 19968, -5575, -1909, -18738, -22913, 17593, - -6511, 4494, -8367, -4131, -9271, 299, 21672, 27245, 6381, 25717, 25567, -8055, -3136, 14867, - 20269, -245, -32234, -28671, 11506, 14540, -22428, -24964, -17385, -2101, 339, -1870, 956, 9273, - -22167, 25923, -29589, -21428, -14549, -14460, 9928, 7039, 19852, 31442, 18220, 12804, 1697, 22006, - -15689, 19171, -20641, 9694, -27194, -22808, 21562, -8417, 2533, 27124, 24809, 18240, -5065, 31872, - 25167, 29055, -15050, 29850, -5382, -1346, 5217, -1399, -11362, 12336, -20908, -23315, -23949, 30083, - -30945, -30212, 23475, 1812, 4415, -20068, -3012, -18628, 6780, 23736, 7354, -17460, -7860, 5094, - 25249, -29043, 13634, 11173, -5354, 17505, -24997, 3360, 13607, 22626, 30624, -3298, -24955, -28789, - -19368, 30813, -15924, -30986, 3172, -13340, 11149, 780, -22730, 23714, 29735, -20983, -13332, 16900, - 7657, 957, -14587, -5670, -26752, 16991, -21113, 30656, 1018, -9426, 11749, 15992, -22291, -17693, - 6528, 25195, 32174, -31403, -8702, -4864, -30663, -28854, 11329, -1109, 15476, -30256, -4405, -13933, - -14312, 25732, 25274, -26692, -696, 12834, -9703, -15227, -27865, 20972, -30310, 21047, 16268, -32023, - 1382, -5557, -32629, -20171, -30117, 13320, 14932, -25736, -30005, -30688, 30250, -469, 4308, -6855, - 4130, 26345, 21420, 30672, 8160, 22841, -11033, 11275, 14486, 21977, 28129, -7031, -8119, -5930, - 23938, 6098, -23226, -18179, 7968, 29683, 12149, -18773, -19879, 24961, 27116, 20136, -18955, -27349, - 17713, 14878, -32235, 19108, 6039, 31697, 12238, 29, 20634, -27468, 7577, 12595, -12896, -26015, - 12025, 31880, 10813, 4548, 13471, 24262, -9249, 29197, 27508, -4666, -5745, -24916, 30116, -28962, - 692, -25514, -3322, -8568, -27043, 31068, -3865, 4399, -6772, 18493, 6299, -28318, -796, -17627, - 22296, -3546, -29354, -29829, 12791, -1397, -5891, 16606, -16242, -12289, 3629, -5522, 29283, 26174, - -12221, 5177, 25934, -14492, 2862, 31870, 11600, 7256, 32329, -42, 3591, 13960, -28818, -15554, - -31732, -8728, -2489, 15574, 15970, -16148, 25840, -10338, 27837, 16746, 21703, 4013, 29524, 3852, - -18561, 24556, -30131, -23021, -4625, 13398, -16582, 10449, -10027, 5269, -8927, 23317, -32537, -17969, - -78, 17644, -20796, 4330, 10346, 18064, 10728, -5218, 8745, 6363, 7824, 12045, 6474, 14744, - -32227, 10145, 27552, -8556, 15695, -9947, -8898, -31460, 10502, -25423, -17683, 29294, -3553, -23247, - 23965, 20360, -25089, -29173, -287, -8968, -15691, 20152, 7519, 8990, -6411, -11470, 4851, 13584, - -2645, -2864, -2908, 5963, -22305, 24108, -28201, -20681, 29141, 10128, 28937, -1132, -21782, -24262, - -26755, 16008, -16541, 21012, 29016, -20754, 26716, 22926, -13509, 11319, 353, 15707, 11827, -11163, - 12366, 14896, 15972, -18861, -929, -3875, 1777, -5175, 14522, 19688, 14787, 11675, -14783, 2238, - 21331, 30759, 15389, 4875, 28406, 30960, -27449, 26757, 24716, 25519, -20860, -22652, 3934, -8244, - -29293, -23882, -440, 8864, 29917, 5672, -4891, -11491, 4671, 11443, -29074, 17767, 889, -30508, - 31816, 3817, -20752, 9495, -22018, 12487, 27758, -25474, -9223, 3510, -8579, 18872, 13200, -7886, - -25806, 22655, 13645, -14497, 2905, 4398, 19728, 6369, 19432, -13843, -26282, 20052, 15738, 7222, - 15848, 32319, -32434, -11851, 17246, -5707, -17570, -6423, -30765, 30191, -20329, 27431, -22779, -29217, - -4925, -29975, -19685, 30418, 29580, -17737, 12236, -274, 8744, 23955, -7347, 23741, 21660, -10207, - -6033, 29853, -10003, -19328, -15090, -3579, 3821, 28845, 20134, -5454, 25184, 25273, -26588, -1310, - 4813, -358, 9861, 20213, -13533, 30974, 21699, -18981, 10311, 28814, 24226, 19843, 20384, 14027, - 12132, 13080, -28725, 31032, -16782, 14959, -16848, 19005, 6769, -31837, -9104, 12452, -1033, -11038, - -12710, 25022, -29111, -25429, -18547, -17690, -32311, 1956, 31376, -10292, 28126, -9943, 10810, 16819, - -18194, -16498, 15060, 7134, -15637, -31532, -16559, -10711, 27062, -11570, 9303, 29822, 31774, -16421, - -12724, 24198, -32768, 14479, -702, -13344, -18688, 13213, 18820, 16641, -32734, 978, 20466, 12632, - 10489, 14327, 20532, 4449, 17766, -3446, 6506, 7106, 30006, 4645, -25467, 13585, -236, 22572, - -676, 12063, -28301, 6900, 4933, -10332, -8903, -8264, -31558, -19860, 29214, -21907, 13524, -7719, - 31450, -10548, -10145, -29504, 15041, 12294, -9235, -12372, -26199, 5979, -31891, 23464, 12197, -32242, - 10603, 32698, 6690, -27060, -2707, -23903, -17428, -16524, -3150, -30014, 14774, -23180, -1886, -31090, - -14997, -12612, 6765, -11067, -29062, -18023, 24002, 2813, -25537, 23136, 27575, -664, -31674, 32645, - -25409, -15231, 1918, 28762, 31298, 865, 32362, 30128, -19022, -20015, -28888, 9574, -1049, 14557, - -30921, -20670, -31695, 24181, 20724, -31385, -26088, 5878, -10075, -16213, -17376, 13992, 32581, -9252, - 16586, 1319, -29277, -10760, -4239, -12590, 26022, 30630, 32582, -852, -27214, 3147, -17590, -19215, - -18894, -15386, -5637, -15108, -16800, -1969, -18019, -23770, 15401, -26263, -24270, 27194, 22709, -904, - 4096, 24619, 16293, -18535, -18691, 20232, 21850, -11775, -28493, -19982, -21557, 7652, 1931, -27050, - 31980, -20998, 10703, -7100, -16945, 9231, -21636, 7706, 10007, -5418, 10979, -18154, 17753, -19686, - 12869, 17262, 1415, 3625, 14992, 8652, 11055, 7982, 27064, -2210, -21509, 12500, 24185, -7321, - 32529, 18279, -13310, -15756, 18397, 11170, 29671, -16778, 795, -25107, -19527, 22674, -11094, -7625, - 1251, 6530, -31769, 23127, 14028, -4892, 4210, -20050, 22118, -8955, -15513, 26995, 3275, -5100, - 24418, -8111, 25831, -2451, -15063, 10513, -28555, -26285, 25, 9789, -23217, 5029, -9910, -14651, - 29079, 25221, 25796, -13105, -9552, -26802, 6460, 17598, 21268, -6152, 25073, -20793, -14837, 31462, - -2677, 68, 29785, 28970, 3293, -24111, -11905, -11713, 32081, -752, -18822, 16326, 24206, 10628, - 3456, -27570, 15939, 25217, 15185, -26681, 16394, 5403, -19524, -26510, -11471, 13235, -8296, -17754, - -15200, 29787, 17458, -26157, -12404, -18077, -25301, 28383, -6656, 8063, 19500, 19202, -2923, -25708, - 6947, 28368, 32008, 8281, -21261, -27097, 31648, -28824, -6105, 9080, -20309, -23702, 1460, 16292, - -22989, 11696, 23044, -20047, -31729, -22864, 5587, 20366, -23980, -12515, 31605, -282, -8984, 32569, - -20403, -10076, 849, 6795, 31298, -21536, -28415, 690, -17565, 17453, 10120, 20161, 2176, 13934, - 16716, 29589, -22759, 30387, 25161, 23625, -700, 23717, -11393, -32268, 4515, -7881, 31790, -8563, - -17849, 4640, 24667, 9767, -22306, -27455, -19518, 16559, -25556, -13043, -16569, -17625, 28838, 2223, - 2297, -8024, 20921, -22354, -3560, 1841, 27998, 14879, -15301, -22708, -20142, 2946, 8958, -17770, - -19370, 10052, 22073, -22666, -3139, -3712, -22413, -28682, 22926, -1815, 21220, 29359, -30870, 19936, - 4929, 12583, -23474, 32269, -30831, -26365, 1836, -30422, -25181, -27486, 8181, -11757, -2825, -16114, - 32141, -2779, 4385, -21932, 162, -4676, -4830, -32692, -11257, 17974, -32114, -11130, -32094, -24414, - 4971, 14611, -23336, -30546, -17659, 16000, 22338, 2370, -25388, 1796, -9213, 12925, -1743, -4264, - 24354, -8441, -2577, -32646, -19173, -20755, -18406, -32448, -15252, -21372, 2726, -16058, -24368, 24791, - -4896, 13043, -17491, 3703, -357, 32455, -3165, -7210, -25802, -11129, 395, 20753, -24567, 21451, - -16294, 20711, -25805, 27408, -15171, -21, -14731, -8991, -2686, 24724, 13592, -373, 9703, -13248, - 10999, -24833, 30972, 12572, -24328, -18659, -12032, 11007, 9068, -10938, -139, -18089, 20976, 6855, - 1761, -1689, 22509, -18254, 29940, 18384, -28298, -6890, -25151, -27291, -11265, 27697, -2932, 32078, - 14141, 15392, -32582, 10031, -8522, 29415, -15811, -1514, -14818, 20590, -16045, -2055, -26759, 25734, - 28897, 22415, 780, -8483, 27432, 22916, -19955, -28062, 31008, -7933, -30289, -20725, 28621, 15464, - -230, 23964, -22626, 22032, -22443, -9303, -22546, -12397, -29811, 13514, -11288, 11571, -12950, 2616, - 24381, 2994, 30044, 14961, -20132, 6415, -3517, -22844, 5833, 12475, 29895, -13055, -31645, 13812, - -13557, 25233, 8113, -7655, 12667, 7278, 30214, 31685, -21279, 1664, -18992, -21130, 3301, -17278, - -5400, -32522, 24327, -22086, 19629, -11409, 19826, 25425, -23301, 27695, -23989, -32678, -26817, -5612, - 14979, -2852, 2094, -4240, -20, 26694, 24041, -10653, -24885, 28132, 8742, 19035, -15782, 6487, - 855, 3306, -25309, -24598, 21509, -2947, 15550, -18207, -26757, 8415, -29093, -27013, 2238, 2310, - -19096, -11286, 13924, -16031, 32553, 16011, 28488, 20360, -29414, 24786, -9974, -27266, -11658, -30511, - 25426, -26131, -23828, -31184, 12231, -27296, -22928, 26666, -5024, 5954, -23101, -1579, -32695, 7974, - 24209, -1808, 11008, -11736, 24635, -9810, 15943, -17914, -23862, -2011, -6171, -21101, 30895, 1974, - 17461, -15, -27687, -20992, -2993, -22822, -14108, 6024, 12665, -11939, 20109, 20228, 23972, -32617, - 2835, -4742, -28535, -7379, 22668, -29421, -26254, 14676, 27953, 12092, 25473, -10988, 32279, 11387, - 31147, -9461, -17566, 5497, 7873, 10730, -18991, 7102, -31001, -24911, -614, -10387, -21155, -19951, - 14378, -31137, 9156, -24952, 29895, 7610, 26452, 2639, -27182, -29228, 21049, -31482, 20799, 18049, - -9267, -19491, 28956, 19245, -25397, -9904, 19748, 19499, 13449, -27834, -26650, -19820, -19836, 8209, - 8724, 25921, -23780, 12927, 18205, -12128, -11845, -28457, -5254, -27985, 1997, 29239, -26932, -24943, - 11822, -3405, -13251, 6128, 27706, 22256, 21723, 4419, 11818, -6734, -28437, 4493, 11518, 9583, - 27656, 25163, -19235, -8372, 19370, -29900, -20846, 7131, 10011, 13234, -2026, -20348, -4742, 16284, - 3959, -25703, -15300, 30163, -10173, 5159, 32118, 14302, -29876, 16755, 8460, 11686, 16018, 3512, - 31124, 12644, 16676, 18096, 10493, 4993, -16978, -3432, -14541, 11793, 27081, 27728, 17145, -18462, - 13457, -14418, 25879, -20395, 22025, -21794, 21720, 29234, -5970, -24273, 16641, 23152, -8414, -22671, - -29038, -24102, -26135, -1993, 17560, 22830, -30863, -11600, -26011, -6628, -7384, 26457, 8775, 15751, - 28228, -15525, -27802, 19066, 19203, -4605, -14095, 28776, 25004, -2213, -30187, 28774, -13876, -22604, - -2687, -16944, 22560, 27895, -18584, -30738, 21452, 7754, 17232, -12994, -9093, -31266, -238, -9795, - 21083, 11307, 21378, 12583, 31471, 21667, -29213, -26535, -28054, -29134, -11495, -4123, -25227, -3155, - -10976, 8012, 9600, 5382, 19723, 21873, 1942, -19310, 14434, -8677, -26172, 15766, 25583, 15092, - 10647, -9560, 5161, -27691, 13299, 18685, -25168, 23163, 20514, 166, -31794, -22177, 7242, 19255, - 4969, -10304, -20416, 15251, -9836, -10539, 26253, 9100, 22805, 25414, 5275, 30572, -5182, 1287, - 28972, 31497, 15379, 2292, -18945, 26449, -13376, 29386, -28233, 13960, 19824, 5831, -22351, -20884, - -1581, -21697, 589, -6370, 20932, 3364, -4051, 19723, 20836, -24482, -31046, -2300, -23973, -3198, - -30136, -4595, -24393, 28648, 30067, -19853, -12871, -16599, 2255, -8066, 20776, -15103, -62, 23824, - 13940, 15706, 16856, 17353, 6508, -32703, -22262, -27796, 8426, 25219, 2889, -30325, -28920, -30143, - 8587, -6036, 7688, -23841, -6683, -25494, 12992, 17730, 30502, -4817, -1710, -20221, -29479, 18789, - 2345, -5385, 1848, 4437, 23005, 15556, -29528, 11275, 8723, 8102, 18673, 13422, -9479, 2571, - 10608, 100, 7574, 24783, 27927, 29854, 22539, 189, 22545, 16339, -30371, 21311, 5915, 6081, - -23982, -25950, 15005, 21913, 18200, -21066, -21554, -26921, 30502, 32289, -24766, 11974, -10902, -17727, - 17612, 3276, 17534, -30917, 17383, -28648, -19923, 30722, 1392, 27313, -23302, -27655, -28141, -20159, - -20866, -9804, 23204, 17384, -21073, 12000, -20619, -22515, 11345, -29631, -15035, -9865, 31356, -7084, - 27150, 10656, 1269, 18052, 3884, 24253, 2507, 30157, -7568, 11995, -18741, 479, 1592, 3846, - 1124, 15637, -2438, -21228, 27492, -18812, -25359, -2933, 9885, -7531, -32612, -17826, 14619, -30667, - 2280, -17147, -16453, -29813, -26253, -29219, 25616, -27397, -17025, 22086, -4556, -10228, -4467, -3291, - -1790, 29247, -12431, -22514, 24250, 30253, -22275, -12834, 8172, -11578, -30497, 27415, -30448, 22058, - 31281, -9713, 27203, 2305, -22296, -10169, 28885, -30648, 26120, 20775, 21802, 16646, 31484, -31113, - 9540, 31705, -20758, -4372}; - -const static __attribute__((aligned(16))) int16_t input1_element_s16[] = { // 16 x 16 x 64 - -658, 6904, -16343, -7276, -19660, 14889, 3120, -24801, 29431, -29351, -16494, -17879, 8409, 30892, - -19877, -11957, 30298, 18420, 12814, 9479, -4080, -8399, -19021, 22888, 27837, 2378, 29204, -4928, - -3885, 17903, -26209, -26933, -9075, 19302, 5052, -24227, -25782, 17040, 14128, -21087, 21733, -27079, - -29104, 13014, 20991, -10250, 23048, -15030, 30623, 9835, -20616, -27949, -740, 23799, 6798, 6459, - 19204, 26402, 27323, -28379, -1925, -8027, -11361, -17496, 6989, 16270, -17199, 16834, -29128, 32502, - -10870, 27211, 9348, -29701, -10538, -8433, -22319, 26949, 32014, -23584, -10399, -3296, 17464, -25213, - 21372, 21530, -31098, 302, 22721, -28002, -27373, -21958, 14975, 21733, 14199, 8364, -13603, 3275, - -4438, -32730, -20051, 153, 29971, -26627, -2391, -10100, -15787, -4410, 10358, 2074, -3069, 4693, - 29925, 28331, -12767, -26451, 23681, 7359, -14247, 18873, -31908, -24323, 14236, 8354, 11399, -26409, - -16298, 26783, 26218, 27913, -6523, -10536, 28338, 30405, -26162, 10204, 11092, 19849, -20659, -16585, - 27787, -28237, -16130, -25004, 31767, -13940, -15780, 4511, 12321, -1930, 16509, -8692, 18558, -28030, - -14205, 19837, 23088, 15578, 3578, -31612, -15334, -21452, -11431, -18829, 16874, 24889, 8791, -31194, - 19313, -17240, -31619, 9655, -7305, 9660, 28369, -11067, 11005, -20670, -28590, -26458, 12931, 20803, - 31169, -31314, -709, -1149, 15690, 23203, -32335, 9138, -14050, 8513, -19946, -27674, -13175, -11973, - -9495, -15843, 13711, 8566, 23429, -5402, 3682, -11595, 7293, -4097, 21688, 19762, -7252, -17228, - 12750, -25539, 21768, 5717, -20043, 3680, 6514, 16373, 10909, 28215, 8945, -23685, 6045, 27846, - 27991, -23730, -26581, 30472, 12611, -28667, 9361, -17652, 18320, -27991, -6673, 11589, 4539, 5429, - 31400, -24869, -21144, 2100, 30900, -5406, 28787, -13266, -5305, -2305, 7873, -1849, -4110, -16182, - 1601, -21510, 2887, -9435, -31935, 8080, 11952, 22002, 23292, 11981, 20852, 22773, -17620, -29826, - 491, 7617, -15093, -10119, -12627, 3693, 27511, -6177, -9041, 8094, -28808, 12476, -3576, -19054, - 29093, 2960, -27843, 15505, 27180, 14735, 6168, 6787, 12746, 13132, -21210, -5425, 799, -1345, - 194, -9835, 22790, -28312, -16980, 26079, -21368, 10517, -4120, 21992, -5436, 14955, -8625, -25638, - 30368, 25942, 2731, -14860, 17891, 20855, 10990, -14358, -1763, -24383, -10340, -6815, 30552, -12373, - -3803, 2227, 12943, 10354, -25313, -24729, 22758, -16891, -18221, -20225, 29866, -30803, -20217, 4811, - -4390, -426, 6130, 17345, 16826, 26608, 3769, -17141, -3660, 10717, -19520, -20806, 22128, -29456, - 28955, -27121, -22138, -31664, 12477, -23337, -28206, 27672, 14612, 18982, -3997, -28919, 32011, -2241, - -10362, 14326, 23940, -8090, 8055, 13976, -26259, 7029, -18431, -14635, 16705, -13561, -10742, 14526, - -32258, -2304, -5733, -16881, -17288, -21574, -4944, 29578, 3795, -29874, -26820, -18749, -24682, -32705, - -11006, 32350, -12004, 23043, -17178, 5352, 12839, 8863, 6097, -25750, -9563, 24913, -12709, 31379, - -31878, -3330, -8955, 10094, 14372, -2449, -27718, 28008, 22608, 20026, -7276, -12294, -20426, -22473, - 1947, 16982, 28168, 1303, 30302, -28553, -9655, -10047, -19003, -518, -17817, -13140, -21633, -21263, - 2152, 28314, 11580, -4144, 12685, -19291, 2618, 15754, 9086, 4372, -16886, -3154, -32447, 17442, - 13703, 9931, 4856, -1448, -9385, -30256, 22910, 12029, -9802, -26019, -6112, -24281, 30888, 27793, - -2097, -27609, 29310, 9196, -13990, 29036, 32036, 30066, 7036, 10881, -6732, 4906, -30664, 7180, - 22346, -13810, 7551, -24642, 14350, 1165, -8058, -17444, 2896, -18949, -9689, -19262, -19812, 31883, - -8543, 19790, 6934, -23450, 24926, 8692, 31651, 21887, 19977, -31420, 810, -15040, 22156, 2415, - 744, -16439, 24294, -396, -31644, -20444, 15755, -30887, -8764, 29295, 28081, 8827, -24707, -8323, - -31613, -9850, 24447, -12195, 11829, -2686, 25289, -8544, -27425, -24353, 6680, -25956, -25946, 28026, - -6600, 6344, 7046, -3818, -492, -5038, -1015, -18262, -28066, 11194, 11925, -17314, -28416, -16788, - 25790, -9562, -28415, -459, -11558, 25516, 2596, 30433, 28721, 29218, -809, -12906, -26688, 21200, - 25356, 28036, -3950, 31850, 16198, 6311, 14792, -16906, 21621, 26434, -19647, -17606, 11724, -24228, - -30612, -31262, 26573, 7908, -19326, 29680, 13341, -14799, -15679, -12901, -5259, 31980, -12063, -12413, - -29593, -20943, 15926, -10551, -19005, -31263, -16016, -13914, -3206, -15576, -10426, -17817, -11445, -30896, - -25376, -14299, -5891, -16187, 19338, 32353, -4231, -4095, 25663, -19765, -26189, -20088, -21381, -16610, - 2561, -19797, -775, 4353, -11744, 10966, -18330, -7415, -26029, -17184, -1640, -28565, -16267, 24321, - 5326, 1222, 5908, 18948, 12303, -21764, -16280, 23474, 22230, -29981, 25166, -31117, 6671, -8396, - -16498, -6160, 24705, 29738, -17282, -25833, -10628, 11775, -3768, 19324, -20340, -11059, 15748, 7626, - -19174, -27627, -4022, 16179, 28686, 21297, 21032, 4941, 6776, 12090, -16563, 24278, -32162, -15376, - -1781, 11066, 16710, -12282, 13761, -10058, -28767, -17886, 12006, -20840, 26886, -24589, 9778, -4148, - 25764, -11944, -25016, -28762, 26320, 1962, 14059, 8111, -26467, -27646, 4225, 27918, -10752, -31461, - -18688, 20148, 23819, 21256, 3158, -7200, 26560, -1812, -30890, -19260, 12426, 1770, -16973, -1603, - 23612, -14898, 7329, -31484, -9493, 9745, 11713, 6509, -3442, -19227, 517, 21032, -3279, 2569, - -20805, -10934, -6670, -17531, 9270, 20215, 32564, -336, 9800, 16743, -29615, -30190, -5750, 5374, - -8178, -18220, 15269, -19878, -644, -19811, 22234, 22887, 26686, -19123, 6107, -21295, 32225, -7333, - 16942, -31599, 18119, 22429, -27506, -31157, -19950, 17640, 29700, 19989, 4746, -10314, 9454, 10349, - -25220, 24571, -5802, 18622, -25926, 7740, -20347, 11399, 3899, -10394, 14251, -3542, -22935, 6644, - 11322, 27336, 14016, -18820, 26966, 13445, 28088, -7700, 16023, -31838, -17308, -2108, 25949, 5251, - -22753, 19801, -28715, 14553, -21141, 22927, 11983, -13167, 20175, 872, 16356, 17619, 25243, -9329, - 19303, -6576, -22278, 9828, -24660, 29451, -22582, -19937, 28581, 27624, -23597, -6017, 5936, -26433, - 29065, -24554, 17053, 5074, -1553, -3467, 5921, 7026, -31803, 13918, -24506, -2916, -9886, 14690, - -8192, 23050, -26466, 11092, 29631, -22615, 7200, 23930, 20900, 17732, -5429, 16475, 13666, -13926, - 29331, 7317, -10699, -13254, 27308, -4534, -8293, -13691, -8630, -4740, 6235, -14796, 8529, 26237, - 22251, 8186, -17167, 6051, -21873, 776, -7730, -29542, -22628, 9534, 29561, 32322, -11977, 8390, - 8764, -31765, 16042, 20079, -26858, -3411, -31723, 16407, -22626, 16858, 29429, -29429, 12743, -32153, - 4173, -18057, 25904, 3170, -21052, 6703, -14421, 7878, 11207, 22181, 3390, 27064, 21264, -26707, - 32098, 1435, -3229, -21417, -5274, -5317, -13258, -2491, 29350, -20175, 12140, 14074, -31267, -24392, - 14181, 16228, -14900, 6906, 4899, -24615, 7350, 10914, 18311, 25476, 17514, 4109, -13118, 27352, - -21725, -2628, -11564, 24108, 30264, 11315, -14993, 17916, 1901, 2259, -28256, 18933, 24604, 30905, - -19811, -27810, 10514, 15416, 26008, 3656, -16483, -23704, 30967, 24293, -23161, -14872, -31522, 11466, - 24451, -31874, 2354, 11797, -8182, -22292, -25855, 5183, -23371, 20068, -17285, 27522, -1352, 21543, - 12426, -20541, -16788, 8502, -24012, -24163, -6168, 11734, -25695, -28363, -24550, 8899, -26438, 11174, - 7223, -6809, -2074, 27145, -7081, -1706, 15886, -26839, -22972, -26855, -4776, -31274, -1130, 23360, - -16833, -31931, -12620, -19771, 25668, -1812, -13715, 29171, -3714, -1881, 12571, 31473, 28977, -24690, - 27035, 17685, -10762, -26124, -22724, -22236, 3076, -25157, -13790, 17507, -19755, 26310, -22594, 3568, - -16560, -11769, -416, -5922, -32672, 21726, -25383, -28789, 16804, 28501, -25941, -30046, 21287, 32723, - 21950, 25116, -15690, 15653, 12944, 2835, -15944, -26686, 9057, 422, 18537, -2832, 10034, 6824, - 23353, 31322, 13289, 15970, 3849, -22843, 17799, 30077, -4713, -19324, -13297, 25774, 19258, -2724, - 18287, -26439, -16097, 22875, 31148, -17124, 2741, 11167, -19774, 26354, 3550, 13257, -16624, 30659, - 28144, -27584, -26721, 28812, -13911, 9137, -1907, 23378, -18445, -1423, 14061, 28149, -6601, -3207, - 19393, 8372, 30840, 27567, -9334, -6979, 25645, 6137, -3207, -2538, 4297, 12701, -29010, 24811, - -6725, 4643, -14654, 1772, 294, -13627, -6036, -11568, -24500, -8914, -2114, -21772, -9149, -7453, - -9329, 11538, 4523, 20716, -14536, -11615, -919, -13835, 31783, -23361, 9731, -8402, 14179, 22970, - -2899, -8322, 7742, -1388, 25804, -14797, -30635, -5450, 20480, 29791, 31271, 29421, -13755, -14124, - 29570, -9314, 10372, -18365, 7518, -31589, 30514, 11392, 15693, 5375, -27737, -6744, -19962, 9247, - 11048, 6956, -24201, -8754, -2788, -27357, 29949, 14456, 2264, -30651, 4189, -26156, 17934, -16064, - -6403, 11128, -8583, 2833, -28502, -11218, -31364, -20360, -22757, -32694, 25270, 25512, 6692, 10379, - 14378, -31179, 18330, -23578, -12749, -32307, -12605, -26307, -30085, 13746, -9886, -8517, 27206, -24013, - 21889, -15625, 28206, 1571, 15803, -18815, -4275, 30818, 2538, -25686, 19230, -24129, -8031, -5904, - 4837, -2175, 32344, 15504, -12842, -27404, 22978, 17605, 32613, 6347, 16201, 26755, 32197, 21851, - 29757, 26292, 26346, 28381, 15291, -13300, 6792, 2236, 28255, -48, -14257, -16223, -8569, -2487, - 23784, -27685, -9829, 3184, 17226, 5671, -27637, -13460, 483, 7865, -13464, 16690, -32290, -18493, - 9290, 30393, 18174, 31354, -17532, -11604, -4249, -27409, -32238, -31968, 12074, 31118, 4101, 14676, - -25251, -23595, -21338, -10566, -28982, -10189, -7457, 8244, 4282, -4451, -12709, -8314, -16081, -12961, - 22696, -10226, -28430, -4686, -2636, -5112, 9662, 2876, 32174, -18695, -25569, 24711, 8360, 27552, - 3396, 11143, -4275, -12060, -22120, -17114, 3733, -15310, -6160, -19711, -297, -18180, -21330, -7906, - 30502, -4263, -649, -21289, 17737, 30838, 16078, -7686, 14752, -23827, -11040, 28784, -10989, 894, - -12319, 31518, 32661, -29406, 31964, -11901, -25284, -24141, -24429, 8404, 5432, 18010, -17632, -13240, - 19705, 794, -888, -9818, 13640, -19838, 10089, 22711, 17051, -30104, -19772, 9661, -11348, 18323, - 28802, -26917, -6144, -15166, -22107, 29238, 15671, 21883, 10483, -9940, 25930, 15918, 22525, -29980, - 15899, -17611, 10560, 28865, -29323, 20625, 22387, 23765, 10767, -32149, 13702, -17829, -26785, 30531, - -15961, 3729, 23733, -21729, 1900, -14157, -3741, -21233, -3203, 28606, 18552, 12676, 20626, 16305, - -13761, 26732, -28007, 31606, 5864, -378, 29131, 21570, -24032, 1204, 10625, -24512, 24925, 7406, - -1878, 7176, 32244, 17473, -23929, 30920, 27599, 6713, 872, 16812, 7635, -11320, 18920, -627, - -21024, -26465, -5145, -19878, -16831, 13317, -7650, -13129, -7883, 23944, -20911, -26028, -12659, 26769, - 23367, -30898, 26183, -29482, -12652, 26897, 25613, 17540, -9044, -22050, 10437, -15115, 22095, -7971, - 10103, 28242, 13009, -28241, 27605, 20939, 323, -7103, 11139, -9560, -28590, 11110, 24871, 14602, - 11855, -6828, -29838, 2650, -7835, 13438, 18813, 16526, 9078, 374, -20709, 28768, -764, -25425, - -21727, 6653, 3544, 1670, 9337, 21383, 2496, 21731, 26838, 16664, 13912, -15070, -26140, -31962, - -18052, -210, -18958, 5340, -8297, -25528, 3068, -9793, 3917, -4226, -4624, 21460, 4502, -30968, - -26061, 31556, -16616, 27278, -10967, -14636, 19588, 9636, 24021, -2983, -28673, -2372, 28460, 28577, - 30799, 20028, -15333, 20049, -25581, -10787, 5436, -15416, 3575, -22306, -14647, -30058, 20880, 27492, - -22435, -11405, 508, 32475, 21763, 12200, -8273, -21337, -31856, 29228, -12445, 25814, -25664, 3403, - 6422, 12751, -16844, 17037, -13889, 28735, -13503, -31065, 3693, 7258, 30839, 19562, 11120, -7844, - -9651, 17383, -20416, 12754, -4155, -25773, 21629, 9486, 7266, 22984, -19888, -32710, 23413, -32137, - -8488, -732, -19817, 8055, 3133, -15770, -16057, -30569, 592, 3284, -7512, -27059, -23849, -22365, - -31042, 24219, -1520, -31065, 7677, -22508, -13902, 13510, -21748, -3554, -10632, 1036, 9083, 24933, - 29117, 15025, 26989, 5407, 9184, 17940, -28383, 7522, 17980, -7135, -24645, -16085, -8896, -29216, - -24286, -24151, -6763, 19390, 25505, -5054, 20110, -14781, -15837, 8767, -20379, -11185, -23738, -10202, - 6383, 15761, 12591, 13589, -3587, 22023, -16531, -19337, -5404, -496, 7213, 10298, -3716, 22954, - -7455, -6420, 8986, 723, 18990, 18248, 5220, 4937, 24048, 14592, 14692, 31468, -9082, 5470, - -8703, -11316, -350, 27801, 10004, -3467, -25880, -9524, 28257, -16428, -31852, -27957, 16214, 15645, - -13562, -8055, -562, -16202, 3441, 17575, -22055, 13067, 26157, 26331, 23648, 30887, -12658, -13903, - 24818, -10677, 22604, -21096, -18558, -3740, 18058, -8860, -26927, 19235, 14147, 13793, 30148, -8089, - -12128, 27976, -15794, 1915, -28834, -28553, -7194, -17255, 4511, -19084, -15654, 5241, 23445, -31883, - -3076, -9291, -26208, 22830, 20645, -31377, 3120, -6377, 7345, -11351, 5955, 13294, -9070, -10039, - 32645, 22370, 13480, 12961, 17484, 3730, 13282, 4729, 2962, 9079, -14465, 2521, -10402, -31919, - 32753, 28056, -28263, -596, -4590, -4028, 13479, 16176, 21095, 25594, -13220, -25609, -25975, 30084, - -28210, 8757, 28943, -24606, 4036, -29221, 17065, 10386, 19984, -22135, -5770, -25956, 30365, -1479, - 22883, -22313, 29714, 30273, 16665, -6466, 20453, 17843, -9225, -15029, 21522, -20831, -27346, 21333, - 13932, 8007, 15349, -5538, 19634, -6756, -12791, -15611, 7726, -27047, 13285, 4687, -2340, -11472, - 387, 13866, 14149, 1082, -23820, 6682, 7642, 25138, -23852, 15170, -28293, -24685, 23917, -6653, - 12780, 1120, 6364, 1506, 10044, -21539, -18281, -2019, -17531, -12190, -14573, -28450, 6656, 30983, - -30483, -12110, 1950, -4620, 16241, -18240, -4741, 17445, -6646, -12182, -9416, -7347, 18347, 9227, - -6510, 4677, -30765, 1485, -28648, -4735, 6017, -11341, -11306, -31221, 31324, 20254, -24165, 12670, - -19701, -16567, -24182, 5462, 17415, 10507, 10587, 26966, 9532, -569, 5442, -9532, -20210, -24964, - 3097, -19967, -23580, -530, -3160, 19699, 12728, -933, 12117, 32646, 24283, 21008, -1665, 155, - 23147, -19540, -30733, 825, 4584, -9690, -20716, -5086, 14864, 5522, 17379, -2528, -30782, 21170, - 15236, 8536, -25396, 30682, 3621, 9750, -8453, 7609, -28576, 23993, 31563, -20518, 17816, 1055, - 10285, 7615, -21606, 5188, -11081, 30293, 12643, -24247, -5960, -16337, 14734, 26912, 7239, 9920, - -25245, 2670, -999, 50, -4321, -15025, 21466, -28600, -856, -14303, -1369, 3989, -6786, 5993, - 28795, -20184, 171, 29597, 949, -24223, -7102, 24940, 2044, -9075, 10054, -9345, -19015, -5275, - -31529, 13474, -22546, 28568, 12589, -28155, -7331, 14218, -13589, -31567, -31199, 31648, 13246, 9445, - -14781, 21879, -25990, 15285, -11678, -27696, 27750, 30577, 16098, 17243, 10790, -13911, 11886, 10452, - -835, -13558, -4849, -20181, 16094, -8101, 25104, -3899, -10645, 3075, -27190, 1979, -966, -8474, - -24955, -20993, 13530, 19641, -21216, -17035, 14274, 15385, -6573, 16539, -21212, -5785, -13485, -25750, - -22097, 24513, -31183, 27270, -11388, 28585, -16123, -6209, -2216, -1967, -18362, 7533, 3859, -29187, - 7909, 31379, 29502, -24607, -15440, 12799, -29399, 10720, -7656, -15402, 4646, 11102, -5026, 6678, - -13061, -9622, 17656, -3105, -18525, 6546, 12165, 16668, 10402, 3260, -10725, -6289, -9063, 19552, - 4129, 13617, -12227, 19440, -6007, -2906, -22100, -12263, 24847, -25147, 21456, 11296, 22290, 30062, - 25794, -16995, -26755, 14234, 6273, 18746, -3120, -26647, -6804, -29057, 22311, 4154, 26244, 25564, - 25562, 32651, 25077, -17537, -12678, -11958, 2131, -9820, -6672, -1006, -24541, -5839, -912, 18241, - 18466, 8450, 15826, -7228, 27666, -750, 22647, -27577, -19362, -24952, 10196, 11755, -23473, -8662, - -15482, 20030, -2340, -20534, 20, -28562, 866, 31761, -7623, 16934, -3717, 26062, -15315, 30477, - 19144, 8049, -30010, -30488, 5620, -9774, 3457, -1430, 27301, -23816, 16139, -27700, -12183, 15511, - 10768, -21750, -18691, -1236, 14034, 22935, 31338, -15487, -20156, -449, -29600, 12502, 11001, 8955, - -10503, -10127, 8872, -1772, -30681, -16419, 22475, -20993, -16013, 28854, -16968, 8987, -6438, -13995, - -11394, -7222, 25868, 14002, -20320, -9725, 8369, 32251, -6950, 31137, -20083, 20714, 9273, -10279, - 21923, 5389, 8347, -764, 20469, 16615, -29481, 28804, 7755, -23713, -12049, -3819, 23426, 6602, - 6347, 1742, -2748, -3077, 30873, -18932, 8281, 9748, -32234, -29672, -2179, -567, 5774, 20508, - -25874, 3304, -4180, -6087, 15297, 9763, -858, 15139, 20206, -5472, 13475, -7168, 13116, 23645, - -24336, 12345, 12827, 6704, -1219, 25580, 18509, 26639, 23054, -17686, 8978, -21093, -7477, -20485, - -28008, 6759, -9089, -6930, 16862, -15365, -30111, -4458, 27914, -10217, 9248, 29311, -3552, 7398, - 19387, 15153, 20899, -14568, -3718, -22739, 25806, 1425, -10447, 1830, 32622, 12243, -14370, 10935, - -12249, 15059, -4849, 18905, -953, -22007, -25462, -31308, 2532, 13856, 19487, 16876, -12799, -1500, - 5568, 14869, -930, -12814, 12735, -17473, 16389, 12174, 17396, -27164, 2615, 18118, -29065, -16458, - 6384, -874, 3738, -12852, -17769, -17031, 22999, -11625, 6972, -22678, -8883, 1088, 11950, 30724, - -10054, -9853, -23349, 16799, -31198, -17490, 21547, 18174, -3829, -69, 24913, -28359, -13867, -6536, - -21946, -23757, 7150, 15593, -16175, 12255, 17226, -9315, 17335, 16473, 27144, -6495, 6036, 18707, - 11017, 7280, 3044, -4925, -30865, 26888, 4444, 11451, 13703, -23966, 18816, -10482, 28884, -24700, - 18097, 3224, 17275, -327, -1627, -31042, 5976, -14952, 10803, 19335, -27842, -11631, -17967, 17683, - -8470, 16963, -11210, 460, -6331, 26113, -7204, 6402, 12228, -16712, -19821, 1031, -396, -12212, - -13780, 23723, -20828, 8694, 31910, 11679, -18623, 21975, 28978, 11264, -7576, -24942, -21564, -10194, - -3686, -12679, -8108, 21519, -1043, 15201, -19692, -8400, -13050, -14569, 30557, -3671, -14428, 19628, - 31028, 3490, -23320, 25409, -3062, -13627, -19903, 22813, -7068, -1662, -1430, -24981, 26511, 7319, - 5090, -25258, 13175, -8348, 10860, 1659, 16732, -6812, 5383, 1638, -25212, 4955, -15031, -24676, - -30201, 18427, 9141, 12204, -32556, -4819, -10966, 21657, 26960, -3902, -4269, 15083, 5536, -8616, - 32188, -14975, -23290, -10203, -11710, 7514, 27145, 26994, 640, -572, 2680, -29175, -32323, 5213, - -8206, 16683, -3466, -20657, -21245, -21880, 9600, 358, 8725, 17304, -10953, -18660, -29042, -11086, - -31529, 589, 13727, 21014, -27754, 21531, 19886, -677, -30781, -19254, 30645, 5238, -23573, 4099, - -13216, -4387, 1571, -13423, -5955, -3481, 13854, 4885, -25010, -25976, -13117, 30495, -15927, 30272, - 24247, -4558, 2295, -11472, 27625, 27123, 13813, -7195, -6900, 12380, -4652, 30701, -18654, 26724, - -30533, -25717, 21600, 15481, 16125, 1369, -15404, -7850, -19253, 1678, 32418, -13009, 29134, 8562, - 26334, -24204, -14779, -32088, 22616, 31457, -8672, -21628, 11844, -15778, 22239, 13159, -20469, 15255, - 22790, 2430, 18179, 4330, 10268, -1583, -18978, -14810, 19085, -25437, 1471, -6424, -23533, 9952, - -4722, -13847, -18591, 29119, 31782, 12222, 22780, -16030, -344, -8533, 17299, 28488, 17290, -8627, - 543, 15083, 23836, 30479, -21055, -19221, -7570, -1447, -20730, -7620, 23766, 11789, -10304, 27963, - 16304, 21160, 16619, -15081, -5193, 15032, -28531, -19681, -12684, -21776, 19394, 18954, -26709, 16741, - 12590, 5845, 29823, 23387, 29025, -23821, -27130, -24117, -11355, 1195, 3478, -28289, -12327, 3370, - -27563, -4499, 1030, -3994, 14079, -26523, -2404, 26792, 26556, -19695, -15617, 28931, 26083, 943, - 3530, -22388, -23769, -7501, -3084, -24906, -9990, -8048, 29179, -18722, 16155, -28508, -26218, -17892, - 14490, -18718, -30134, -31972, -2381, -29983, 22618, 1226, 1584, -1421, -4552, -23110, -26801, -1094, - 6808, 23719, 3093, 14771, 24118, -32319, 12882, 13131, -17232, -25, -16625, 12525, -21206, -17008, - 4803, -28430, 16630, -28761, -21830, -22257, -3856, 26806, 29456, 16626, -11704, 11961, 1378, 15269, - -25588, 4987, -1787, -15275, 4694, 22140, 20089, -28907, -17399, 30769, -8032, -5068, 15337, -28107, - 5757, 26691, 28080, -30536, 26090, -26792, 1453, 32674, 3494, -2187, -26796, 3516, 22338, -9542, - 796, 29753, 19685, 19506, 31194, -30906, 3678, 18123, -2102, -26127, -13047, -31990, 6104, -10267, - -3622, -21368, 10691, 2970, -22733, -2364, -18073, -7420, -1225, 3481, -23154, 3799, -22240, -19207, - -21499, 14769, -21175, -32549, -20219, 26225, -15854, -12912, 5329, 29754, -3477, 4815, -20145, 14974, - 24647, 14061, 9079, -20790, 14879, -14767, -32306, -4163, 28536, 16209, 20127, -13506, -26983, 8002, - -778, -22531, 20675, 9383, -19181, -28798, 12635, 16694, -8931, 9161, 22969, -7105, 18291, -28847, - 11654, -7458, -1771, -24706, -19939, 19822, 5283, -26267, -18530, 16318, 6390, -18359, 17835, 24321, - -31512, 27560, 13123, -24384, -10222, 8560, -5695, 28067, -25612, -26058, -5838, -18550, -27398, 26430, - -26763, -84, -30501, -22608, -21903, -20467, -23792, 18208, -19818, 27141, 15902, -19329, -16728, -23269, - -11727, -24018, -2430, 20809, -1635, -31101, 6013, -1253, 13730, 19760, 14928, -5520, -8801, 29049, - 21954, 20347, 26096, 11326, 23885, -10048, -7763, 13324, -23287, 30240, 3980, 18330, -3669, -18323, - 1770, -25077, 28275, 22277, 22579, -9915, -11520, 23821, 17568, -12400, -16439, 4047, -30494, 11181, - -5914, 26343, -3550, 9658, 15223, 24792, -31543, 21694, -10578, 25666, -885, 24775, 21796, -10014, - -17706, -17472, -1682, -28029, 17005, 12346, 22233, 23440, 3690, -17020, 19939, -4912, -6991, -7621, - -23987, -7853, 2939, -20124, -30458, -31690, 20122, -26343, 15087, 27358, 30769, -32369, -22769, 3449, - -25081, -4288, -29948, 32335, 18828, 9423, -12982, -26720, -18244, 23820, -25415, -31872, 15969, 17348, - 1864, 15462, 27706, 22840, 4292, 15601, 18473, -24665, 23590, -7345, -11084, 23353, -2671, -13639, - 6541, -5619, -4280, -14649, -12251, 27931, 16467, -4020, 30451, 23231, -19909, -27303, -25060, 17950, - 17714, 19212, -13847, 1765, -1643, 156, -4396, -14928, 31465, 24449, -22966, 8689, 28654, 31825, - -17735, 5339, 24017, 17781, -19985, 20385, -4493, -11502, -1176, 21327, 6153, 28398, -4788, -3912, - -11520, -4661, 18416, 20411, 15226, -17346, 29668, 6011, 9565, -1493, 29381, 27439, 13446, 22469, - 18765, 7317, 6216, -24758, 1946, 29153, 23194, 11189, -25959, -15101, -27738, -7043, -17763, -22486, - 9140, 19833, -22202, 3114, 13154, -791, -16169, -25401, 8724, 4886, 18300, 3319, -9232, 26628, - -5782, -12569, -29925, 24234, 10412, 20444, 9302, -18034, -13689, 25566, 15466, -10359, -2282, -16613, - 24690, -26305, 27293, -19403, 75, 16294, 10902, -2740, -10826, 14299, -10895, -30168, -18266, 22651, - 23438, 19433, 2063, -12162, 19813, -32655, 27396, 13664, 32038, 29278, 2821, 24729, -10010, 10117, - -26357, -26311, -9395, 5081, -23337, 984, 12957, -2889, -1896, 19048, -6385, 26568, 25163, 9931, - -16377, -2423, -25836, -899, 20863, 7080, 10295, 5596, 28247, -14382, 20025, 21762, -13247, 26089, - -9690, -9530, -2857, -15156, -2326, 5025, -5870, 7412, 25328, -1901, -9014, 27714, -24558, -14484, - 13700, -3845, 3575, -14853, -29796, -28795, 10935, -18914, -10584, -9575, 28800, -25474, 24074, 29081, - 986, 14401, -2436, 8289, -18216, -20138, -23249, -23227, 30646, 8147, 19851, 30998, -8923, -2820, - 12640, 20100, 263, 16248, 772, -14977, 14908, 2185, -7087, -11312, -26167, -10111, -19559, -30584, - 32449, -6902, 11161, -26007, 19546, -26745, -16525, 9469, 7719, 3111, -21607, 16021, 21863, -15931, - 8055, 20400, -30910, 1204, -9504, 14058, -29753, -696, -10643, 17175, 23597, -9327, 23886, 10676, - -9236, 13622, -30516, -10318, -13221, -20389, -1718, -24969, -1858, -6313, -29455, 18993, -30580, -8887, - -17522, 23794, -9430, -18983, 5272, -2992, 12511, 26381, 8673, 13413, 18131, 7250, 2110, -25726, - -11965, -1975, 31304, -30530, 15636, 21437, 14251, -13037, -271, -1948, -6634, 20262, 9131, -6070, - -24290, 19410, -18728, 10431, -12744, 30733, -29993, 9603, 32119, -183, -23395, -29552, 26724, -31771, - -31188, 25778, 20322, -25218, 3868, -10344, 19362, -8071, -23767, 1881, 10094, -18702, -20814, -28683, - -9437, 23138, -2034, -23680, 19956, -22636, 974, 5839, 10261, 11012, -19098, 16355, -11905, 13398, - 31587, -26428, -27229, 21343, -18706, -24140, -10005, -28052, 22913, 9981, -11791, 32383, -31819, -589, - -3544, 27790, 9216, 5265, -15956, -10595, 24755, -10343, 6428, 6601, 16233, -3153, -18871, 67, - -15091, -24768, -19784, -3256, -24564, 27387, -11477, -22155, -26458, -15705, 7606, -5053, 1829, 28204, - -2702, -12717, 5821, 575, -208, -17235, 21062, 28684, 18679, 21924, -10175, -17063, 1843, -5001, - 2304, -29421, -12274, -17061, -7004, -3804, -8003, -5704, -10059, 706, -4315, 20490, 19261, -10211, - 18441, 28194, 1418, -32381, -30582, -30454, 8695, 15900, -10262, 10611, -25118, -1921, 6538, 21717, - -31977, -15131, -9091, 32284, -24669, -27929, -32657, -23328, -9606, -29044, -5150, 31239, 22145, 17193, - 27682, 28441, -28747, 32533, -59, 4647, 10544, 13860, 25174, -15325, 17438, 12638, -22954, 18329, - -2790, 19156, 510, -10276, -14963, 7913, -11290, 19600, 3755, 10046, 28778, 31770, -17708, -28524, - -30969, -641, -25936, -26714, 1805, -4513, 15595, 23649, 28924, -294, -30679, 14338, -10294, -29069, - 32103, -24942, -1547, -24364, 22192, -4619, -27962, -22015, -5196, 15435, 6449, -29475, -30495, -16915, - -23042, -25474, -27353, -32653, -8684, 29339, 30288, 1593, -2919, 16318, 3931, 10564, -29102, 10795, - -11794, -16643, -28168, 25186, 24914, -2483, 5703, -30398, -15539, -32562, 22014, 19241, 23392, 16167, - -19983, -20442, -11232, 13149, -15063, 32194, 28258, -12646, 22428, 7187, -18799, -10248, 17709, 5702, - -11888, -31960, 14979, 19295, -32126, -32613, -17886, -23308, -23120, 12549, 15508, 6949, 12034, -1856, - -4703, 12609, -1584, -24350, -17384, 3575, -17194, -29133, 19007, -16231, -16162, -16502, -9067, 6451, - -14441, -31545, -5288, 2471, 28859, 29086, 6089, -10998, -5438, -11023, 26001, -6911, -11176, 6686, - 20431, -32286, 12768, 7058, 30523, 6029, 22230, -19355, 30216, -2063, -8111, -26572, -20150, 4342, - -19970, 29166, 4731, -12336, 29168, 20435, -10850, 28137, 11588, -2136, -12793, 17676, -31708, 13377, - -22445, 16078, 9364, -21505, -22459, 8077, 9808, -27519, 19698, -11828, -19530, 188, 26174, -20897, - -7566, 19497, 21300, -11344, 1539, 16709, -32410, -12675, 12132, 19551, -28923, 32223, -5534, 28397, - 5973, 25169, -10222, -21237, 9562, -25265, 12300, 7560, 11598, -31481, -29145, 16942, 3818, -16441, - 5224, 8034, -30821, 10611, -27672, 13500, -16067, 7163, -20827, -15563, 6647, 8437, -6567, 30548, - -24065, 17366, -1382, 29128, -6430, -17165, -3863, 28014, 28307, 20494, 30563, 15042, 2129, 15282, - -28994, 29534, 26026, 19965, 1669, 2520, 27990, -17326, -12240, 30035, -25574, -28358, 11691, -8985, - 19241, -10439, 7422, 5069, 12200, 17304, 15325, 8821, -31930, 11440, -17370, -4620, 3633, 29907, - 8873, 31902, 7306, -9169, -20883, 2892, -13951, -12066, -22460, -6872, -11877, 31475, -13130, -10414, - 19361, 22130, 14061, -32245, 28382, 32395, 10999, -6081, 28867, -18133, 17433, 14198, -2080, -24783, - -23880, 27613, 22957, -17320, 8295, -16112, -13379, -15914, -18177, 23367, 16689, -32329, -7804, -21853, - 10003, -2765, 27017, 8373, 6416, 3227, 14574, 22836, -21687, -8461, -27168, -28910, 6783, -9931, - 32519, 268, -6352, -22265, 29370, -30322, -28860, 11768, 25109, -9042, 5438, -31262, 5309, 29132, - 26541, 7229, 9095, -6085, -13526, 23392, 32168, -12547, 30384, -25119, -5840, 6150, 25928, 17956, - 3005, -4242, -15680, 29477, -31704, -21586, -16440, 17422, -6927, -3877, 26875, 5761, -15041, -21237, - 28262, -21676, -2426, 10212, -5171, 1466, -1607, 31743, 5759, 27051, -4826, 9821, -12887, 20548, - 12434, -23491, -5301, -26830, -3433, -31186, 511, 10771, -4887, -18038, -24924, -22878, 16378, 17852, - -16386, 6050, -25962, -24184, 7583, 31392, -2591, -27848, -20310, -28739, 15031, -28943, 4732, 16881, - -23572, -17263, 29761, 4294, 6336, 21798, -19492, 13199, 4608, 18070, -7435, -13609, 22049, -18785, - 19516, 15815, -28051, -21045, 29119, -29169, 29418, 11204, -21172, 2264, -15949, -24486, 22345, 17040, - 709, -10718, 8323, -29412, 17701, -27682, -7998, -31385, 17714, 30068, 8776, -10570, 3914, 9256, - -16730, 15717, 2071, -25223, -22611, 8555, -699, -19233, -2610, -20003, -5603, 14543, -26044, -10325, - -12427, -30807, 16935, -22146, 2116, -6632, -32083, -30790, 2750, -9513, 24302, 1066, 13801, -15227, - -18620, -9524, -4346, -26641, -31259, -10420, -29124, 882, -22572, 31593, -24960, -22441, 19029, -18976, - 14398, -19386, 19794, -18517, 23732, 20647, -10795, 13477, -12915, 18416, 24399, 27595, 7721, 6587, - 1529, 22543, -16158, 16379, -7234, -31711, 27433, -21384, -9163, -14997, -8895, 11486, -26063, 8630, - -14226, -20457, 9177, 28127, -31960, -15982, 21642, 1847, 12893, -13454, -17918, -25809, -17744, -15464, - -26904, -13429, -2514, -6083, -20396, 7494, -31323, 1995, 29286, -30085, -20590, -435, -14392, -31349, - 25193, 2148, 11829, -6577, -670, 29564, 31914, -740, -7022, 7690, -5604, 10282, 16571, -20029, - 32162, 2008, 26831, -23433, 26407, -12156, 10474, 14891, -16111, 14458, -12370, -18959, 13249, 23381, - 21717, 16465, -9854, -8408, -9453, 6456, -9075, -21842, 13847, 14868, 2186, -372, 5108, -695, - -30966, 21064, -16750, -6432, -7968, 5194, -14885, 30523, 6479, -11279, 12841, 2976, -13878, 19271, - -5633, 3874, 17758, -4982, -14188, -17462, -3403, -22042, -22968, -31360, -29821, -8034, -25594, 16959, - 31878, -1672, 32714, 27582, -24089, 27731, -19512, -10338, -25335, 5606, 873, 4422, -31584, -20416, - 4050, 29096, -20267, 31441, 18149, -5910, 24352, -19775, 16816, 11954, 32021, -21534, 5615, -10448, - -20954, 6246, 8241, -8919, -20119, 29802, 3466, 24625, -2764, -10525, 12968, 12691, -26459, 4698, - -14491, 28339, 16504, 6782, 24590, -3873, 21420, -8074, 22587, -32725, -27392, -31418, 18987, 14626, - -8156, -31146, -22705, 30565, -32410, 7112, 22575, -22798, 5470, -23687, -14855, -30401, -4775, -21641, - -27783, 12529, 15518, -4373, 14033, 15227, -29760, -17509, -5464, -6404, 14405, -21745, -18178, -30010, - 28888, -15838, -23428, 3419, 13897, 12194, -8295, 1445, -5399, -17026, 13229, -13629, -20321, -10102, - 24874, -7171, 1149, -16289, 9782, 21444, 6510, -12755, 31476, 22438, -11962, -22289, 23668, 19168, - 26129, -27444, 15895, -27076, -6727, 31971, 31812, -1069, 22509, -31088, -30869, -17648, 177, -8102, - -29552, -23813, -11670, 16269, -19937, -24181, -25266, -31497, 31015, 20781, -6651, -2916, 16582, -15309, - -23472, -9077, -16444, 26387, 8831, -23041, 24044, 13723, 24587, 13955, 6688, 3845, 1690, -12521, - 1773, 27120, 25780, -27179, 19856, 22877, 27683, -13331, 449, -5604, -4218, -15818, 23503, -3967, - 5071, -2012, 28866, -14919, 29970, -3574, -18204, 13609, 577, -12234, -9546, 1084, -23611, -14897, - -8927, -30593, 25039, -15184, -730, -1042, 29475, -4758, -26598, 25219, 6343, 23689, -27803, 1332, - 11783, -11397, 27926, 22658, -31708, -5279, 10001, -30491, -1851, 23859, 15528, -15776, 16809, -32418, - -19202, -793, -9610, 1090, -4056, 10747, 11761, 27103, -17477, 9916, 27493, -20, -28424, 12131, - -14878, 30805, -4730, -15839, 14326, -5859, 24028, -21252, -12099, -8328, 10859, 27877, -3585, 1903, - 3618, -21301, 16582, -27156, 2323, -2439, -3304, -22119, 18751, 1689, -22122, 29637, 15774, -32188, - -708, 8220, 12076, 22982, 13450, 2095, 4859, 7696, 17818, 5406, -9026, -18244, -19952, -3816, - -9149, -21622, -11328, 15891, 18749, 3190, -26680, 24293, 17059, -30734, -17084, -14465, 4532, 17277, - 24688, 21832, 19744, 17894, 8637, 12192, 19365, 31589, -30007, 29508, -23732, -17807, -21908, 19261, - -20045, -28247, -28244, -582, 5953, 14015, -23899, -9989, 21283, -279, -25761, 25837, 11247, 23493, - -9585, -23639, 21653, -25390, 30874, -7847, -26861, -32026, 17673, -6443, -4126, -7272, -23186, -28068, - 25005, -19392, -10119, -24355, -24887, -11300, 12967, -20386, 18243, -7979, 9964, 26536, 22436, -29546, - 1604, 10433, 21997, -5004, -10828, -1704, 18769, -48, -882, -12853, 19875, -20207, -10932, 2362, - -11542, -17109, 20647, -27445, 1164, -17374, 17044, -10624, -211, -10986, 23220, -1105, 24208, 32525, - -19568, -7316, -32173, -14604, -26378, 24757, 30121, -4855, -1717, 17864, -28222, 30548, 19756, 14805, - 30891, 31289, -1984, 25254, -32663, -7536, 9060, -28829, 2307, -29056, -26552, -25593, -5440, 15623, - -22384, -8323, -1551, 29606, 31272, -29231, 25164, 4875, 2494, -21801, 7954, -13412, -563, -8921, - -31106, -20177, -12306, -293, -9334, 16996, -10975, -6502, -8821, 22732, 28197, 19338, 22871, -3293, - -26773, 5289, -9762, -11720, -19419, -10984, -6839, -12582, 9005, -27437, -17108, 5399, -21088, 27275, - 17571, 21553, 3642, 21096, -2192, -20229, -25358, -6055, 7327, 2250, -10223, -9120, 3335, 19983, - 27462, 20420, 12229, -28927, 28755, -25447, -18239, 2681, -15427, -13175, 4815, -30346, -5284, -8098, - 3191, 1022, -10585, -29128, -18859, -26922, -17872, -9917, -21669, 22791, -29566, 32545, -8196, -17344, - 21484, -2484, -31825, 30269, -1845, -14066, 863, -1933, -14196, -32370, 21944, -23154, -16920, -20148, - -5232, -17069, -32292, 23401, 12704, 15457, -14527, -31268, -14146, -26750, 26225, -6024, 10881, -8740, - 26090, 32105, 2382, -4332, -16072, -30009, 23143, -20117, 24827, -3498, -22743, -17105, 20151, 29647, - -11847, -7172, 2478, -32134, 19748, 9812, -3456, 21889, -13087, -12683, 29181, -10467, 6907, 12194, - -11838, -6264, -22419, -19737, -6311, -28978, -5641, -22878, -10380, -29512, -3793, -18674, -23839, 11788, - -25428, -3701, -16174, 19677, -2528, -20883, -10862, -12207, 23552, -25770, -17065, 11744, -8001, 13393, - 11488, 23823, 3066, -1437, 12179, 15488, 1502, -32702, -18568, -20030, 16511, 32456, -18137, -20961, - -8433, -3209, 1705, 3954, -15145, 15336, -27772, -14924, -24691, 5943, -2535, -16822, 12541, -1022, - 25826, -22310, -20416, -23803, 215, 20946, -20109, -11228, -23282, -3612, 2915, 9897, -25026, -30524, - 23506, 19079, 6954, 25132, -25384, 17083, 5394, -31385, 2761, 2005, 17113, 4296, -18220, -30671, - 32011, -18351, -28626, 13096, 15352, -132, -24256, 13264, 8557, 19805, 26273, -212, -28031, 26032, - 22960, -13418, -31896, -8578, -16622, -28436, -15632, 29942, -24919, -2387, 14615, 7958, 22369, 21044, - 25110, 26173, 5000, 9949, 31509, -19467, 25340, 11379, -18436, 12286, 1140, -31445, -6798, -32724, - -27596, 79, 28347, -31386, -2640, 5670, -25785, 21381, -26606, 8803, -7740, 23911, 11707, 22907, - 31318, 3913, 16679, -27581, -15363, 22143, 5587, 8950, 29147, 3972, 8335, 5847, -3091, -20005, - 31450, 27149, 10574, -18395, -19554, -32589, -30808, 14078, -22474, -28044, -15823, 7886, -4506, -4849, - 15639, 17827, 14912, 30334, -25989, -9926, -29491, 18554, -1918, -30877, -8726, 19594, 6054, 8863, - 26955, -21934, -29636, 28649, 4995, -2931, 26748, 694, -811, 18149, -8270, -25411, 9801, -18974, - 8914, 30440, -15843, -7812, -31048, -12372, 4596, 31786, 3489, -21322, 3879, -20652, -18821, -4738, - 3710, 32227, -10407, 30981, -30330, -5152, 25538, -23442, 30559, -6499, -26781, -20568, -28371, -32062, - 18066, -15534, 28116, -29350, -19216, -19924, -21691, -30522, -31301, 2911, 31717, -29873, -4065, 26790, - -6175, 32495, -21685, -5234, -27179, 9013, 3428, -4219, 662, 2032, -24056, 19986, -14071, 3095, - 16701, -12890, 1901, 9077, 16762, -9572, 22946, 15107, 23414, 1252, -15342, -17811, 3054, 16320, - -2702, 696, 24438, 9566, 20865, -31234, 30544, -16096, 32162, -20032, 21265, 14497, 15747, -31544, - -10649, 27614, -25582, 1894, 19500, 3481, 10396, 18902, 31960, 6516, -22502, -1720, 18355, 16485, - 7609, -4226, 24598, -29435, -24680, 10934, -25868, 16729, 28427, 12466, 7841, -27508, -29725, -2540, - -26296, 24260, 25119, 15934, -24045, 23537, -13104, 32291, -8551, -23493, 29534, 28583, 14665, -3310, - -15758, -22342, 28589, 32484, 23006, 29827, -1862, 23561, 3447, 22661, -7738, -27441, 27135, 25675, - -21581, -2828, 20038, -10021, -1572, -9774, -27084, -18068, 19844, 13540, -12571, -8266, -26654, 15332, - 25297, 26334, -13959, 18477, 20613, -13592, 29182, 27128, -9717, -8401, -20334, 11540, -24538, 19817, - 21590, 26506, -24997, 25114, -19334, 28521, -8147, 13879, 16661, 14511, 13061, -11352, -31633, -26629, - -13391, -30793, 19649, -21743, 28171, 14231, 5757, 17607, -19931, -18603, 8491, -29248, 7372, 1181, - 4762, -24264, 13233, 2884, -24126, 28913, -10977, 14641, -18846, -27602, 13371, -31489, 19060, 21082, - -20156, -20428, 3826, -7010, -16581, 2226, -7666, -11961, -6369, -7069, 18373, -28251, 622, 32758, - -15928, -17550, 12032, 14588, -29268, -7209, 29200, 4946, 18946, -828, 4147, 10932, -829, 355, - -12191, -28950, 20647, -8347, 12518, -9946, -26646, -28829, -10828, -17389, -5556, -24981, -17551, 23444, - -20502, -7698, -31117, -6452, 28791, 999, 11858, -26006, -29692, 24945, -15473, -30971, 30767, -19079, - 12944, -23840, 29323, 8877, 2456, -17834, 2539, 10153, 18271, 20167, -7452, 22310, 17992, -26541, - -822, 8650, -14885, 16800, -22870, 30187, -27764, 11369, -21875, -5321, 21365, 30096, -2925, 13355, - 13512, 25596, 3925, 28568, -21189, -25099, -5656, -18539, -27133, 339, 29452, -22493, 10521, -2303, - -26074, 1492, 6181, 13289, 26428, -17957, -20479, -23423, -13514, -23601, 28162, -27680, 29599, 21145, - 12744, -31116, -6557, -5273, 15747, 32476, -4310, -11511, 16118, -8448, 21310, 20265, 6312, -3011, - 21821, -5774, -1739, 26885, -14211, -5742, -7880, 8054, 18019, -7375, 9382, 16467, -21831, 25481, - -14842, -7910, 19204, 22483, -18862, 19705, 32093, 8225, 24322, 24882, -5449, -31648, 498, -21952, - -10126, -18406, -2044, 32707, 4615, 10402, -7845, 14213, 5520, 9025, 2560, -15835, -22472, -31985, - 89, -16799, 26080, -32568, -20215, -5633, 19577, -5349, -4333, 8227, -1994, -32460, 2518, -21523, - 16976, 30741, -5088, -2559, 8463, 131, -13928, -9503, 32471, 13135, 30726, -27784, -1875, -19573, - 32655, -28095, 26473, 31387, 16118, -8217, -18617, -15863, -5249, -31997, -19235, 4422, -219, 6005, - 17547, -30440, 2260, 10285, -10237, 1215, -1755, -11761, 13153, -27225, 16260, 28424, -15071, -32319, - 31185, 18357, -4283, 22425, -2235, 4770, -13983, -13413, 14873, -28303, 2046, 17787, 22567, -28600, - 514, -5327, -1410, -14850, 30311, 5077, 8530, 8079, -10297, 26066, -26737, 23280, -5319, 1480, - -609, -14783, -32643, -22594, 26267, 26678, -15542, 31062, 452, -22391, 22810, 19885, -30111, -14721, - 31792, 31875, 17132, 28189, 31114, 4560, 24848, 11539, 11286, -21396, -29789, -23489, 28566, 32027, - -2330, 28583, 31932, 4757, -28636, 27917, 13621, -6641, 19490, -29074, 19831, -213, 15760, 2399, - -27323, 28613, 31409, 4267, 20693, -27158, 17548, -19513, -31238, -28908, 1111, -21729, 3247, -19670, - 26934, 28654, -30959, -29166, -19508, 29131, -32342, 4407, 20644, 6129, 16648, 28957, -32149, 21659, - 27726, -2370, 24528, 15407, 18681, -18807, 16654, 17182, -20239, 25523, -26833, 13275, -31499, 27039, - -1826, 8587, -5846, -8624, -29433, -8012, -7181, 3708, -2549, -5666, 6471, 29476, 10646, 23230, - 13115, -15266, -18885, -18763, 28903, -6663, 6119, 10315, -25968, -10691, 28790, 1294, 12662, -18205, - 23996, 29904, 25938, -23747, -25319, 24309, -15286, 4957, 16173, 15584, 3085, 28169, -10857, 2374, - 7447, -29602, 6585, 24229, -16613, -32627, -32560, -13737, 15114, 3291, -30359, -12485, 28959, -25962, - -26048, 30804, -28113, -28595, 21017, -27871, 11634, -24375, -17178, 27575, 16123, -10181, 15582, -14315, - 12445, 28268, -26969, -7899, -1929, -12356, 5726, -30655, 23609, -32411, -3008, -11465, 22827, 25506, - -4652, -5073, -20370, -22100, 2180, 2948, 13698, -6676, -25125, -21172, 23146, 25305, -30740, -14592, - 23756, -21013, -13778, -31920, 20790, 18669, 14957, -30471, 9903, 17238, -7322, -13459, 14227, 17965, - -8003, 14908, 1843, 29227, -3468, 11647, 15761, -2826, 1220, -13765, 23087, -22687, -23851, -16548, - 13651, 18240, 9410, -25235, -10172, -16298, -27055, 30730, -9764, -2720, -18617, 26768, 18650, -255, - 5310, 4061, -21748, -27035, -19706, -65, 21901, -30338, 4598, 22777, 975, -11012, -8915, -17108, - 5670, 22275, -16851, -23669, 30436, -7614, -30853, 5536, 13763, 710, 13314, -2651, 17862, -29506, - -31743, -2442, 11856, -1714, -13132, 5746, 1922, 26271, 27629, 27136, 7720, -27684, -31310, -20519, - 30866, -1191, -15797, 19205, -4892, -14140, 17565, -9020, 30911, 24158, -21954, 25710, 32074, 23258, - 11037, -25791, 24910, -32264, -16418, 20502, -9551, 9270, -25432, -27714, 12630, 12158, 16723, -5120, - 3014, 14875, -11402, -9896, 12694, 11774, 13229, 27680, -16466, -1964, -17823, 9974, -29348, 24763, - -16178, 23374, 3144, -14040, -9394, -21993, -25043, 27312, -25580, 30107, -29506, 29275, 9178, -16455, - -23880, 30302, -20572, -12213, 12826, -22948, -9905, -11565, -6842, 6420, -20649, 1700, 20962, -25149, - 11694, 18138, -24962, 17285, 3215, 23126, 32377, -12781, 32402, -28776, -8116, 16831, 12918, 7105, - 18796, 25944, -3792, -18527, 24333, 20943, -10879, -2488, 11327, 26205, -14267, -27060, -12422, 5689, - 8308, 19417, 6476, -26656, -25266, -30818, 20729, -4962, 1806, -32693, 24386, 15519, 14956, -16236, - 13415, 23247, -16191, -22758, -2909, -24922, -9178, -3096, -31277, -21427, -8206, 16196, 7617, 31519, - -18465, 22273, -3948, 28866, 5268, -15983, -2333, -18352, 29333, 15384, -31109, 17578, -15014, 3449, - 21544, -22231, -8239, 9162, -32036, 12858, 10417, 17794, 20213, -25984, 7098, 10528, 30960, -14995, - -10392, 12700, -17722, 5628, 2577, -5999, -20641, -10365, 16896, 25353, -23309, -3374, 24620, 9706, - 24819, -2093, -20887, -4677, 25538, -19865, 9875, -2983, -3566, -22150, 22448, 7736, -20314, 25959, - -9520, -15408, -1438, -25395, -31677, 4944, -26829, -20872, 3871, 18486, 21325, -28582, -22181, -22938, - 8341, -32180, -30578, 20984, 24549, 15585, -25665, -9396, -12479, 31258, 18177, -17903, 22684, 4653, - -7439, 28820, -8538, -15384, -28809, -16295, -14943, -11961, 18963, -22921, 15021, 826, -26559, 3010, - -7061, 10352, -21921, -27170, 12188, 1546, -14950, 10218, 5942, -6447, -31017, -26435, -26584, -14121, - 30530, 18041, 10911, -25791, 32289, -31115, -25470, 26990, 126, -14022, 16182, 9694, 23242, -30611, - 20968, 26577, 30994, -15490, 9760, 4259, 23398, 10865, -3781, -22103, -3626, 17761, -17852, 26660, - 16269, -6255, 24277, -18581, -21479, -2910, -12988, -11995, 5412, -17106, 31330, -16768, 20755, -27851, - -8911, -7168, -2754, 8705, -30765, 14799, 17839, 7776, -30898, -32304, -7383, 25934, 16153, -20864, - 1446, -6834, -16859, 15474, 30427, -25369, 7783, 14967, -19310, 5876, 1892, -19855, -30096, 32277, - 19760, -21626, -14699, 15615, -25747, 15493, -7979, 23089, -16017, 16250, 20541, 23759, -17921, -5357, - 16854, -9944, -2820, 15430, -31110, 6340, 9529, 357, -10037, 23859, 16477, 7309, 21181, -9472, - 8820, 6257, 1255, 17113, -2875, 14440, -27203, -13554, -19220, -19184, 26889, 23917, 30423, 917, - 3609, 16167, -19806, -16640, -26677, -2566, -12550, 23797, -20427, -10344, -29839, -15042, 8380, 12076, - -14034, -25952, 3375, -32059, -30067, 10991, -12969, -28615, -24339, -20495, -18927, 4577, 23276, -21784, - -27414, -29193, 5918, 13395, 21588, -28343, -12921, -3257, -31705, 7834, -24029, 6768, 21636, 17286, - 9659, -29557, 5141, -8223, 6613, 11617, 5274, 11160, 31536, -15162, 9693, -14630, 12433, -11328, - -21572, -3302, 15387, -2750, 27053, 23151, 2978, 29731, -12618, -5737, -25817, -7413, -7535, 15010, - 18965, -7713, -14013, 14802, 12432, -31411, -18255, -27102, -665, 5841, 21370, 6162, 18962, 23600, - -19381, -23713, -17110, 2929, 17682, -1634, -6514, -262, 252, -6579, -8947, -2518, -15381, -6920, - 9045, -28815, 24429, -18891, -2132, 26225, -7866, 3930, 17689, 1964, -22637, 27613, -18095, 12363, - -1113, -22297, 4031, -6415, 21087, 2156, 488, -15656, -26234, -30736, 28089, 4349, -8714, -5167, - 3581, 18910, 26706, -32460, 1951, 10214, 6095, 29649, -12827, 4203, 3849, 18068, 9907, -27259, - -6624, -23039, -8201, 12499, 5202, -9820, -25309, -7913, -32362, -14642, -8575, 20376, 21754, -31157, - -10403, 27504, -15205, -21953, 24838, -31403, -16444, 1749, -7099, 1590, -533, -25098, -31626, 27466, - 24569, -993, 2057, 22047, -22651, -8413, 1423, 31898, -22292, -21525, -32244, -17527, -27556, -24731, - -20700, 10095, 20203, 30575, 28916, 19458, -13480, 14093, 16570, -12980, 31246, -32464, -2957, -20966, - 14250, 26811, 16054, -14824, -21741, 952, -25384, 25664, -21614, 24270, 6640, -2248, 27294, 14747, - 15683, -17566, 30549, -21031, 13712, -31611, 31716, -22982, -19890, -8042, -7473, -31361, -8649, 7055, - -19825, -5879, 7479, -3728, 11722, 30523, 31078, 26332, 10829, 24286, 22789, -20638, -22539, 29790, - -14080, -13101, 31809, 20103, -15665, -2162, 12158, -21970, 408, 9778, 27461, 3458, -7416, -11425, - 6741, -29440, -25397, -5998, -21520, -24019, 22801, 12068, 28393, 13134, -24432, 21433, 24271, 689, - -9620, 15303, -25732, -9863, -1968, -26992, -15268, 13136, -13045, 22469, -8160, 4766, 15595, 27714, - 16196, 18454, 16259, -27229, 6645, 11339, 9696, 22277, -17842, 26806, 19356, -14926, 18103, -6605, - -30802, 4767, -19122, 10088, 3698, 7876, -26559, 13251, -146, 29488, -14514, -18949, 2760, 32503, - 9451, -295, 3538, -5901, -2148, 17671, 8897, 10616, -14044, -3946, -18066, 4572, 23825, 7192, - 12612, -15626, 26021, 1852, -4045, -30154, -25104, -18098, 23676, -19571, 26460, -20482, 15197, 30564, - 4320, 25823, -29894, 1714, -29454, 15531, 15882, 15668, -2195, 371, 17152, 9789, 31201, -17297, - 6160, 28001, 31530, -1435, 24658, -1731, -29122, 7366, -5524, 11964, 1627, 22169, -16097, -6833, - -4758, -2115, -14463, -14321, 2456, -5953, -19529, -30544, -27296, -13275, 14608, 12592, -22675, 9238, - 11701, -21656, -18666, 27679, 19574, 21540, 1520, -29245, -28885, -16704, 27741, -12432, -21153, -5626, - -5212, 20628, 12860, -9516, 4416, 22901, -29194, 17639, 14454, 16360, -19621, -9005, 14732, -15723, - -21335, -15908, -2590, -12642, -7632, 7607, 26295, -2980, 13553, 1879, 12492, 28220, -5000, 20030, - 13371, -10720, 8543, 18878, -16947, -3954, -17376, -32690, -14191, -24092, 11945, -15324, 21280, 12266, - -22876, 26042, 4255, -27685, 24580, 7135, -14131, 12744, -10648, 2066, -16677, 24517, -580, 1729, - -32678, 24215, 29429, 4526, 3892, -15311, -30903, 22424, -12848, 8077, 29497, -10222, 25432, -24074, - -9051, 28028, 25476, -17436, -29560, -6931, -2490, 29640, 7566, -27141, -24593, -2493, -17214, -9329, - 7187, -28782, -4099, 19592, 21109, 18404, -18930, 10703, 22209, 18746, -15310, 12778, -27411, 21181, - 16009, 20510, 8841, -2725, -9101, 28845, -4090, -12548, -559, 28955, -24775, 28315, 4905, 15573, - -19479, 20633, 28654, 20536, -22983, -22892, 28661, 10094, -3744, 16039, 18970, -11401, 22554, -18810, - -169, -24252, 20195, 23514, -8908, 2113, 23289, -19281, -20558, 27914, -19130, -14993, 9505, -16927, - 22222, 14834, 3314, 21242, -3189, -10608, -15796, 12569, 22470, 3196, -14832, -19212, 24209, -30976, - -18153, -4964, -4199, 11785, -15480, -3898, -20290, 6893, 5269, 29697, -25249, -19002, -2380, 264, - -31440, 15861, 30864, 4743, -14418, 16842, -20058, 21555, 32315, -15036, 3817, 5931, 6867, 26804, - -3846, -22709, 28532, -12396, 13989, 11298, -32275, 19088, -3341, -18372, 12306, 26468, 25490, 15058, - 13250, 7541, 23303, 12557, -31051, -17693, 31236, 28766, -260, -29301, 7018, 26680, 13055, 32207, - 7360, -4331, -32757, -12250, -2991, 28949, -15322, 9748, 11881, 18752, -11001, -22324, 22644, 21723, - -26139, 18444, -23065, 31297, -4111, 18637, 4057, 27665, 32147, -11652, -204, -3055, 17052, -1552, - 24117, 13930, 27285, -22975, 22766, 29361, 1306, 26232, -26273, -14250, 12489, 27539, -15506, -21803, - 24052, 24111, -12329, -11787, -19081, -15517, -28546, 22758, 10163, -18924, -21471, 22275, 5633, 11767, - -14622, -15376, -4826, -8001, -32685, 5048, 1604, -25491, -28641, 25386, 15313, -30041, -3102, -13157, - -25626, 14156, -96, -28210, 9149, 11376, -24542, -23494, 24458, -26686, -6821, -31932, 26469, 27364, - 28609, 10390, -26283, -1743, 9312, 28022, -21485, 8320, -22527, -11861, -7524, -21352, 3022, 20254, - -29508, -10977, -4977, 19090, 8783, -13200, -15959, -4489, -19149, -13321, -25863, 12030, 25841, -20282, - 25471, -5702, 23416, 22446, 13654, 25282, -17610, 25661, -1219, 9067, 23128, 4382, -31116, -31485, - 4790, -8039, 19961, 10739, 22896, 20428, 4360, 2671, -29559, -8539, -29990, 12601, 4765, 19987, - 13057, 8032, -29873, -4531, 21091, -31883, -20578, 24181, -16778, 14512, -4418, -16171, -27116, -32389, - -24671, -31091, -29040, 18493, -6976, 123, -9609, 14255, 8905, -30481, -17464, 12723, 29299, 23582, - -7220, -25311, -28759, 12484, 23525, 5807, -27399, -21764, -9434, -1293, 10631, 14668, -31596, 31645, - -12351, -3127, -28898, 17340, 19573, -22807, 7772, -12855, 12038, -9368, 20286, 17358, 8432, 1278, - 23442, -240, -19001, -10357, -10012, 1258, -717, -2840, 861, 18780, 22112, 23059, 25696, -22774, - 17003, -18001, -24858, -26316, -22751, -15586, 19493, -15847, -26095, -19751, 27131, -4061, 32168, 1758, - -16368, -29654, 8916, -11175, 21976, 21913, 17872, -25324, -24754, -11612, 25611, 2635, 5611, -3415, - -18169, -13003, 30180, -7712, 32445, -1973, 25103, 14855, -7025, 21021, -20571, -2516, -14652, -1824, - -12566, -13588, 26027, -8864, 12615, -17447, -7008, -2582, -12603, -12825, 25467, 17471, -432, -25713, - -30098, 24389, 18063, 3472, -1186, -12570, 6227, 13565, 10700, -19092, 22837, 5864, -10628, -7693, - 13449, -7455, 16012, 15539, -26852, 22879, -4982, 5466, 20095, 8991, -13893, -23666, 18017, 3587, - -15983, 1587, 27767, -19012, 29517, 17603, 32478, 5237, 31148, -27681, -7646, -9175, 11981, -6912, - -31420, -18395, -13123, 6368, -30254, 15605, 4926, -4414, 24023, 210, -1840, 2604, 3491, -11441, - 10798, 16144, 8718, 26715, 1676, 25677, -2776, 30228, 28157, -16733, 5428, -31253, -15900, -2619, - -15098, -10349, 18501, 13057, 1868, 32752, 17771, -23175, 2354, 30557, -8745, 15660, 10567, 22892, - 15028, -582, -18077, -23291, -18848, -16704, 5076, 1540, 1107, 21347, 31065, 29761, 3613, 11034, - -29322, 11084, -14794, -11618, -3413, -30918, -17719, 18779, 25744, 9207, 12359, -22279, -25774, -21333, - -10080, -8388, 247, 4164, 30434, -4094, -22858, 30319, 10725, 21614, 6127, -15822, -13298, 18335, - 16473, -26905, 4854, 1372, -20205, 7204, 1036, -14309, -17259, 27243, -10730, 8467, -6893, 18603, - 27555, 15367, 23211, 18290, -3894, 27378, 15313, 15354, -30748, 3018, -26331, 6447, 31439, 3463, - 7944, 9613, -12654, -15631, -7543, 18123, 26236, -10458, -12747, 28481, -4585, 3331, -27822, -29573, - -19589, -28873, 5156, -13609, -9916, -16430, 32493, -5721, 3335, -31701, -29457, 30048, -2002, 18934, - -26946, 7473, -17658, 10204, -18749, 12393, 13219, 21803, -20849, 27258, 13640, -3455, -5970, -27622, - 19940, -21972, 3537, 13205, 26924, -14538, 8808, 16477, 25290, -5736, -20863, -14270, -18527, 12181, - -32441, -29504, 24909, 13326, 28668, -23666, -22547, -14724, 5672, 6855, -17033, -2794, -17717, -23515, - -13549, -31046, -17542, 12019, -8844, 12154, 17038, -4496, -9334, -3385, 28999, -5118, -1687, 31512, - -3628, -21955, -423, 19442, -4322, -31428, -26545, 18169, 25882, 19195, -17821, 23232, -15219, -14014, - -5429, 30421, 10806, 30185, -29447, -19343, 21738, 2362, 22943, 18870, 17889, 12726, 3335, 15065, - -8638, -29936, 5348, 16969, 2351, 6480, -18753, -25850, 8236, 15010, -2176, 14970, 19003, 31696, - 25566, -26787, -22180, -11468, -19634, 20077, -16231, -25109, -8099, 2629, 18965, -14803, 24671, 20612, - 18264, -16387, 13722, -10353, -14313, 19243, -4293, 20947, 6864, 22448, -163, -23724, 8427, -2969, - -4101, -7059, 19943, 20209, 24453, -17678, -388, 21139, 24557, -25914, -21987, -13323, 19226, 31387, - 15368, -23756, -27916, 7517, -18899, 12643, 9846, 9084, 32613, 1105, -14073, 27016, -7041, 26411, - 28427, 6122, 3855, 31430, 17400, -26024, 26092, 95, 13129, 27736, 14015, -4666, 29225, 10422, - -5613, -31923, 15692, -31784, -10081, -6473, -21197, 31559, 32462, -29206, -4131, -2945, -6069, -17830, - 2048, -10079, 3332, 2939, 24456, -24942, -18904, 30406, -22801, -1265, -6968, 4836, -4476, 13957, - 10859, 4055, 3972, -11773, -5276, 28143, 30605, 8427, 16628, -6165, 25335, 2400, 7759, -29965, - -27625, 653, 9688, 24168, -2043, -21448, -22414, -16298, -30057, 8012, 31764, 30662, 17972, -12553, - 5564, -13396, -29646, -25199, -5670, 28723, 30419, 16800, 13878, 25814, 2654, -2652, 6813, -5804, - -29147, 21129, -5879, -14771, -4812, 9804, 29272, -26644, -2662, 15126, -4648, -13506, 28777, -7274, - -31255, 9586, -25306, -28414, 7065, -27418, 25906, -12521, 17176, 19346, -28706, -245, 23372, 939, - -9905, 12099, -22047, -7321, -95, -30270, 10944, -16005, 26668, -3742, 5733, 22057, 26330, -29732, - 963, -23364, -22858, 11830, 14584, -10422, 1346, -8973, 654, 32232, 24708, -15743, 21358, -12184, - -6596, 26298, 358, 4203, 23013, -13028, -6791, -12788, -9899, -6646, 2121, 28667, 4730, 1236, - 5467, 8236, -31955, 12148, 3838, -17519, -24029, -23235, 28842, 25311, 25376, 22743, -2756, -9927, - -27558, 3928, 1164, 27532, -21335, 29971, 16768, 4503, -28260, 9927, 2601, -12121, -9182, 7186, - 24822, 11766, -23384, -17674, -7025, 15825, 9935, -15510, -1398, 30538, -17847, -13109, -32302, -3074, - 4797, -18509, -9477, -16269, -3261, 31101, -9303, -1894, -927, 17114, 8242, -7080, -30042, 27095, - -18813, -17342, 15759, 24768, 18198, 19140, -26257, -21260, -18322, 10453, 3117, -12354, 7089, -24315, - -24273, -3616, -15466, 724, -28751, -4042, 16460, -30817, -30642, -12840, -30654, -19939, -27358, 26249, - 28004, -21778, -31243, 11115, 23760, -7458, -6439, 719, -20495, 4123, 29157, 11516, 31043, -15800, - 7411, 7839, 5617, 19411, -23810, -6870, -25398, -10802, -8874, -19003, 4916, 30942, 2122, -18968, - -5459, -25658, -30081, -7375, 23606, -19177, 26172, 4866, 10038, 19975, -7973, 30177, -11077, 29535, - 28472, -16770, -25071, -17693, -8868, -19367, 3924, -1852, -1857, -3903, -5734, 9773, 613, -31901, - 7520, 8508, -12483, -29267, 30463, 8325, -15195, 20652, 6494, -17865, 6255, 29534, 4121, -7775, - 25544, 7719, 1568, -17788, -21667, -2050, -6841, 32413, 8516, 17263, -27290, -2029, 5418, -5872, - 8014, -12148, -23453, 5512, -2532, -27806, -7511, 1224, -4340, 18547, -11544, 21430, 24890, 30051, - -2961, -3727, -857, 3707, -25096, -18753, 6055, 16541, 8020, -30885, 6120, 28925, -9463, -24580, - -31289, 8066, -17563, 26379, -15826, -18107, 28193, -3432, -32186, -32714, -24105, -16993, -17521, 2281, - 23255, -8557, -13251, -27193, -13607, 10851, 18602, -1072, -26911, -3053, 7337, 15968, 9511, -24644, - 9471, -22848, -30667, -24893, -31383, -8926, 14902, -29288, -17638, 16831, 16976, -32504, -5676, 9668, - 24333, 8648, -14529, -8848, 26529, 32494, -17511, 8018, -29246, -19307, 25110, -31677, 32128, 5624, - 16, -32028, 7020, 15453, -3952, -30271, -3286, 407, 7830, 10041, -21842, 29901, 1205, -16352, - -23390, -13144, -28715, 15984, 14580, 32344, -6947, -16787, 29357, -5337, -16076, 1113, -26561, 6601, - 151, 4202, 23252, -17687, -32230, -7072, -1681, 2860, 682, 1543, -29964, 22411, 20073, -6481, - 11373, 8474, -30273, 8920, -30683, 19712, 16001, -24081, 12046, -29384, -17413, -32621, 138, -18875, - 24152, -14127, 3751, -26392, -10378, 2100, 11980, -17650, -32487, -28529, 6385, -12634, 20523, -13766, - 27003, -16240, 9135, -31561, 27493, 23800, -3391, -9401, 25685, -2014, 24071, 11844, -26424, -18414, - -15000, 9548, 10714, 11031, -30332, 26332, 23443, 169, 22876, -31704, -15360, 14418, 24261, 31817, - -14729, 14777, 9715, 6647, -22711, 13090, 10404, 4384, -28513, -32291, -16262, -11989, -29983, -29574, - 28338, 25536, 24556, -8447, 1702, -31550, 18893, 11428, -13362, 24328, 1893, -17657, 26467, -25658, - -3691, 25329, 5403, -430, -30957, -6534, 21938, 91, 1000, 26567, 13417, 17837, -11865, 12866, - -20652, -24335, 7155, 17151, -29270, -20763, -24155, -2672, 11947, 20565, 30907, -27824, 17969, 16520, - -7879, -22965, -5134, 18957, -12326, 15474, -4613, 30499, -7739, -19243, 16115, 4078, 1842, -21122, - -22316, -23466, -26170, 31926, 32474, -23793, -3254, 12074, -3048, -11743, -5202, 14954, -5723, -30230, - -12421, 6453, 10197, 14440, 10448, -7185, 16070, 3948, -2549, -10548, -20141, 6308, -14394, 28189, - 921, 22689, 13511, -16228, 25007, -13334, -21699, -12004, 8508, -23994, 22924, -13060, 17190, 551, - 4807, 7038, -3138, -20771, 1319, 18871, 20545, 21333, -16068, 23385, -22810, 9829, -18809, 9828, - 4469, 2141, 10370, -11884, -19392, 8968, 7531, 31799, 13122, 2439, 28010, 1454, -14870, -9800, - 12794, -12062, 16177, -4093, 15362, -15174, 7665, 10692, -27962, -9240, -12083, -22093, 25696, 18982, - -20510, 21659, -9452, 11819, 20718, -1654, -27649, -26695, -10572, -1156, 10008, 15061, 17386, -19489, - -670, 28140, -29581, -7572, -3768, 7317, 26139, -14370, 26884, -10592, 24030, 13443, -12233, -31982, - -23828, 31852, 22999, -11023, -20721, -25799, -8842, -25911, 12565, -7155, 1342, 3846, 31473, -14352, - 16551, 14990, 25390, -22768, -16358, -18512, -17876, -28734, 11174, -5511, -23099, 23925, 28196, 6821, - 32349, 29299, -10947, 14382, 12658, -13930, -10748, -19681, 15232, 30983, -9312, -555, -19263, 29286, - 27642, 25290, -19993, -12653, -27333, 10872, -20915, -9316, 19239, 10843, -30933, -11416, -4777, 5828, - -2139, -21740, -4740, -5190, -17508, 11815, -6749, 28428, 2106, -8210, -18551, 26, 29116, -7407, - -3017, 12718, 22779, -3284, 480, 31503, 684, -24127, 6562, 11127, 18137, 15459, 23531, -25223, - -883, -21461, -29747, 10889, -2834, -30305, 5917, 29453, -28847, 15882, -14266, 9875, -1684, -13775, - -4664, -5198, -17499, 5881, 2315, -24788, -23264, -2706, 518, -31809, 7970, 26390, -15571, -12445, - 25005, 21757, 29638, 17197, 31116, -4425, -606, 31587, 1932, 21794, -18418, 6542, 30363, -13347, - 5347, -24477, 17586, 31764, 5979, -10047, 5464, 12919, -23964, -6725, 11197, -3032, 783, 16554, - -31582, 18148, -12049, -4606, 5903, -24028, 25745, 9377, 11951, -711, 3928, -20841, 27349, -6856, - -7693, -31881, 19211, 20004, -9356, -29920, -20767, -14602, 25827, -17131, -5866, 25966, -19052, 1148, - -11553, 3877, 1228, -14663, 22848, 12323, 19637, 28405, 602, -23268, 560, 14068, 7331, 23645, - -11022, 18640, -28908, -8764, 31165, -19029, 29190, 10719, -31417, 19041, 9645, -28558, -7248, 29374, - 18860, -6684, -16528, -7225, 30025, 19035, 7793, -8614, 31292, 21499, -18165, -7158, -17599, -21354, - 8717, 4133, 11213, 19373, -26459, -10156, -30673, -3436, 4725, 4789, 4341, 25969, -18666, 2733, - -2064, 28958, -13955, 9923, -12677, -12424, -24287, 13172, -17893, -31970, -22671, -4392, -19117, 26150, - -12523, -19623, 26071, 18153, -17801, 19244, 810, -25772, 26155, -4794, 2087, -1944, 29520, 29385, - 15751, -16585, -15915, -880, 4797, 12762, 19702, -12437, 14112, -24713, -28402, 16345, -6377, 4443, - -23799, 22242, 175, -6932, -20928, 3920, 18983, -31612, 19582, -25546, -5233, -12814, -8853, 4846, - -10379, 21372, 12095, 13914, -28837, 23239, -730, -24276, 2761, -3316, 23271, -23302, 21291, 15209, - 31112, 4978, -14018, 21167, 11509, 2462, 2294, -3891, 8591, -26267, -11708, -17022, -14592, 8653, - 26714, 29140, -26393, 30672, -22025, 17300, -1508, -2180, 8309, -32712, -3033, -24859, -11547, 32006, - 27792, 29509, 22260, -4350, -708, -27052, 27441, 7369, 21048, 31930, 32667, 7204, 12123, 18142, - -13577, 18252, 5607, -7037, -16751, 32110, -213, 24530, -32750, -31241, -2151, 27048, 16093, 30941, - -29988, -2119, 27875, 30103, -3270, -10892, 3754, -11553, -26941, -32620, -18030, -31756, 13651, 24838, - 11845, 22419, -29808, 6589, 27519, 28192, 16375, 10961, -28118, -27424, -31494, 11739, 26806, 25657, - 31613, 23444, 7837, 340, 25656, 7925, -25375, 29631, 22120, 15798, -9150, 466, 2512, 5040, - 23475, -26675, -23908, -980, -8859, -5914, 22271, -10579, 17484, 10057, -9067, 7598, -26227, 3773, - 28497, 8331, -6769, 16009, 9759, 28131, -8800, 13800, 3821, 28370, 29311, -11715, -5615, -18879, - -27133, 3999, 17933, 10878, 21331, 5345, -6921, -19658, 5489, 15691, 8680, 7115, -29089, 17554, - 15109, -9891, -26156, 20071, 8124, 2892, 3028, 11945, -1322, 7341, 27257, -16914, -6094, -20157, - -24596, -28989, 28930, 16120, 6273, 13853, -30558, 13295, -24049, -4687, -19828, -21892, 15140, -16764, - -4923, -6846, -6268, 14548, -10619, -30032, 25414, 1056, -20246, -24043, -2119, 21055, -21516, 7371, - 16002, 2721, 30566, 20855, -25289, 25542, 2473, 17898, -29100, 9665, -1163, -23067, 15429, 1373, - -9071, 10876, -28570, -3413, -28396, -16510, -521, -17866, 529, 26374, 29628, -17286, -23997, -15797, - 14199, 2906, -21756, -29690, -18281, 10177, 20407, -17591, 13851, -22788, 10089, 29566, 22880, -18770, - 25747, -25901, 20064, 29261, 2427, -28043, 27454, 28101, -1454, -24721, 19575, 19529, -18539, 24009, - -16097, -6925, 2630, 23991, -7300, -25196, -13855, -29090, 26498, -252, 7392, -637, 510, 28878, - -12772, 7961, 9114, 8036, -13767, -16193, -24575, -24490, 25145, 28232, -5820, 27806, -24191, -522, - 13431, 26477, -13380, 6551, 7200, -29787, 19876, 12661, -19763, -18864, 25917, 21057, -9593, -12085, - -2660, 10426, -20964, 15438, -30779, -9345, 1333, -15051, -25664, 32718, -22152, -10436, 18099, 23758, - -14921, 11303, -8971, 28077, -1706, -6929, 28621, 27484, -6301, 19038, -25598, 11999, 28982, -9577, - 12783, 11082, -16200, -5717, -12883, -26040, 32575, -6893, 6238, 13362, -23755, -3081, -11314, 31094, - 28851, -19942, -8543, 25018, -12380, -29827, -1604, 32176, 2597, 2942, -7501, -25309, -7123, 29441, - -28363, 19801, -29408, -1650, 8285, -31612, 13268, 15222, 26622, -2158, -21737, 18859, 3641, -2521, - 13087, 24291, -22738, 1115, 361, -29288, -32674, -19036, 260, -32762, -770, 26326, -5158, 25481, - -9030, -13736, 12516, 3113, -21311, 3891, -15205, -14642, 6168, 25645, 11148, 16793, -16577, -18636, - 31736, -22258, 5352, 5488, -15799, 23448, 29179, -3867, 27936, -27596, -7575, -25008, -13628, 14476, - -19979, 11253, -16552, -18810, -32313, 16872, -14039, -2912, -30797, 1496, -27722, 1331, 9058, -16740, - -17774, -13575, -16662, 11014, -32648, 24492, 28892, 6738, 11288, -9918, -8542, -6247, -18494, -27774, - -16493, 8724, 13114, -21176, -77, 7689, 16852, 25638, -7739, -31999, -5294, -105, -21596, -7127, - 16652, -5824, -30090, 28877, 486, 1871, 19189, -30711, 8772, 13225, 20350, 24285, 16316, 27898, - -1868, 15944, -25101, 2151, 15508, -29304, -32270, -29109, -2678, 8885, 27947, -10460, -6018, -9739, - -28371, 4326, 9405, 5108, 6856, -10167, -26260, -7943, -15191, -32481, -26079, -9491, -21057, 13897, - 24806, 21016, -18860, -16174, 27318, -17762, 9596, 30157, 7268, -6119, -11048, 5984, 18802, 24923, - 28657, -2606, -14217, 4101, -6136, 6791, -24269, -24828, -29607, -21981, 11102, -32198, 17421, -3088, - 8637, -31098, -6613, 6494, 2462, -12225, 11339, -28355, -24561, 18306, 26457, -12409, -4140, -26867, - 15553, 6483, -21683, -31612, 24875, -9787, 20951, -29322, 21061, -27436, 7409, 4087, -29313, 2469, - 23731, 4254, -17161, -25567, 10667, 8353, -32720, -16141, -1000, 32485, 18513, -2902, -7870, -11816, - 30455, 28048, -11365, -7671, 23528, 22773, -25666, 30706, -18097, -32754, -15222, -23204, -3615, -16586, - -28486, 31111, 3553, 21301, -17379, -22064, 27560, 10289, 20849, -2449, 17938, 19513, 20712, -13205, - 329, 4644, -3056, -25718, -21529, 21927, -26466, 24467, -3030, 16151, 15314, -23574, 4003, 8599, - 9687, 26953, -30187, 6929, -1646, 31348, -30504, -23226, -17548, -31633, -15037, -12291, 16531, -27693, - 26074, -11828, -11411, -29446, -19235, 2765, 32018, 664, 14898, -17098, 31341, 3950, 7008, 14532, - 1548, -18236, 10484, -11192, 16999, 4688, 24953, -26852, 28738, 19758, -18745, -11521, -27210, 19992, - -18735, -4192, 9030, 10821, -2150, 31957, 27826, 6973, -31777, 9913, 15386, -26088, 17512, 15976, - -27916, -15333, 28635, -1871, 29505, -10627, -24235, 10493, -4854, -5487, -14202, -4633, -306, 11901, - 21760, 5765, 18835, -22934, -1070, -25526, 18532, -22601, -18050, 17227, -5001, -15187, 27560, 26201, - -16629, 29366, 24745, 23020, 17897, -32561, -8896, -23935, -4021, 7411, -1406, 6716, -25432, 17251, - -7277, 31062, 15197, 22966, -4643, -19486, -2336, -10544, 273, -30430, -6610, 31497, -2160, -6081, - -21933, 11545, -23387, -19797, 15040, -1953, 15709, 5370, 15541, -24365, 8711, 1691, 27870, 12827, - -9864, 4501, -21971, -17894, -26942, 27455, -10346, -30722, 16879, -18713, -20885, 12884, 20710, 14995, - -12598, 7394, -4372, -22200, 12401, 27211, 17544, 31449, 16713, -22219, 24304, -24316, 8083, -25729, - 13999, 30286, 12268, 1772, -5458, 23302, 24472, 20642, -3025, 13809, 20520, -13039, -8947, -16337, - 23909, -13110, 32126, 4261, -17288, 28398, 5329, -5810, -619, 7108, 12058, -19793, 22123, 24974, - 8574, -31077, 14361, -5754, 3870, -17749, -28403, -16838, 14543, -11967, -26876, 25003, 20680, -6381, - -2804, -30834, -8909, 31242, -21757, 27531, -5119, -30210, 19733, 26833, 24759, -15524, 8330, -3004, - -5703, 31030, 22116, -31322, 109, -1065, 22755, -16736, 5547, 29277, -8505, -13021, -26006, -873, - -11918, -30062, -23969, -17176, 8320, -9854, 19745, -32482, -22666, -19007, 29289, -7804, -5935, 4765, - -7103, 9962, 30207, -31991, -12430, -27403, -27987, 23590, 24325, -26635, -24586, 4670, -664, -11981, - 7501, 20900, 1209, 24140, -22821, -9915, 10724, -22831, -28451, 13862, -29778, -5181, -19194, 10817, - 28850, -13791, 4449, -21829, -2894, -24923, 27027, -2242, 29100, 19667, -7667, 21104, 10963, 14536, - -23253, 213, -19154, -1471, 13952, 19715, -4419, 31556, -6115, 22906, 21936, -32552, 18622, -9286, - -4256, 7068, -3255, 20774, 16239, -8803, -28102, -20062, -20835, -9515, 19903, -15942, 24648, 8466, - 15547, 3839, -10012, 9094, 9489, -13231, 15387, 8539, 29122, -8013, -26908, 13330, 31580, 9062, - -24335, 710, 9286, 13768, -26616, -14847, 20496, 12807, 11155, 18355, 12404, 19352, 28632, -7319, - 30004, -9913, -5203, -22805, -18806, -24052, 23187, -31434, 31550, 24869, -3766, -10576, -16281, 11185, - 21688, 18321, 28942, -3280, -2092, -9795, 11582, 27631, 8312, 2540, 17347, 2171, 10111, 17054, - 21349, -28931, 22644, -3214, -11516, 25342, 22311, 5827, -11836, -32420, -16848, -2451, -28797, 23876, - -31731, -1331, 29344, 27207, -354, 26220, 3600, -28711, -6325, 9569, 21633, 8159, -11562, -3847, - 9755, -9528, 2989, -5423, -5929, -17374, -5315, 1930, 1353, -7333, 7366, 6653, 9328, 29676, - 32148, 17124, 27300, -9694, -17165, 6260, -21147, -29975, -4079, 7211, 16027, 13653, 204, 16217, - 26046, 25872, 148, -23820, -24210, -32443, -147, 23471, 5786, -20274, 22821, -28938, -30126, 6708, - -26084, 22555, 26976, 27001, -14539, 7882, 29008, 25542, -9485, -13355, 6336, -30972, -10322, 13229, - 6405, -7574, 30906, -832, -10970, -968, 29062, 17061, 23988, -24230, 7349, 43, 12797, -24617, - -16400, 692, -6915, 24954, 25879, 12427, -16706, -10156, 28429, 2170, -6409, 13489, 702, -11514, - -16406, -23830, -32178, 18785, 21873, -1973, 29772, -30724, -7922, -4989, 23598, -19793, -11436, -28420, - -1108, -24074, -16793, 19524, -16856, -1800, -26024, -7692, -18349, -28963, 3, 20685, -16309, 31074, - 13895, -18440, -14754, 14989, 894, -12906, -12394, -9259, 21608, 7534, -27690, 26805, -17301, 31212, - 1451, 896, 12521, -13754, 744, -2732, 13834, -23948, 13562, -25367, 24471, 32430, -23513, -33, - 7455, 10996, -21720, 26195, 25405, -25162, 4774, 14764, -19954, 30554, 15215, -16813, -27545, 29113, - -23652, -13664, 14188, 23607, -12456, 20973, -6200, -14409, -32033, -14255, 2762, 4517, -32088, -1938, - -17773, 27027, 21725, -23188, -28880, -14371, -8442, 22861, 24252, 13936, -18039, 20411, 1131, 12375, - 21098, -11396, -24347, 17142, -16825, 10111, 13517, 23520, -9699, -6159, -5113, -4901, -13769, 30999, - 15946, 20998, 9224, 13189, -7235, -12364, 16524, -24811, 32697, 23032, 22134, 6520, -9219, 14359, - -21784, -23251, -32101, -13805, -9652, -24765, -11805, -400, 20784, 3649, 30386, -14172, -21017, -21325, - 11744, -19639, -1265, -238, -13180, -28380, 9472, 6699, 8795, -29706, 20111, 5405, -14433, -12986, - -9961, 12045, 14978, -2829, 13754, 5162, -17253, 16994, 13701, -12118, 4534, -31239, -25183, -22893, - 16861, 13345, 22741, -7628, 682, 23594, -3895, 22899, 28855, -21727, 28286, -1898, -7434, 160, - -24794, -28018, -7888, -7696, 11154, -22546, 26587, -18456, -14588, -18309, 18019, 32164, 4507, -7414, - 25321, 19786, -16299, 16843, 32406, -15274, 32561, -12274, 28873, 29802, -8659, -7462, -26550, -22657, - 30489, 6547, 6120, -15867, -24784, -25101, -9836, 17762, 13848, 29605, -32130, 16316, 9638, -14076, - -4141, 25509, 8584, 15706, 16406, 18955, 32548, -3364, -20198, 29120, -8630, -15842, 417, 17087, - 15818, -23448, 11938, 26258, 4032, -25235, -4706, -3565, 3892, 31908, -32300, -2632, 2966, 26356, - 10775, -24001, -8010, 18015, -6909, 27893, 13193, 14770, -7858, -3200, -13019, -14090, -4125, 11640, - -13713, 25457, 14605, -13699, -19653, 28509, 15223, -27122, -29258, 5379, 7430, -2851, 16881, 18888, - -22292, -32766, -23589, -13566, 10366, -3642, 12673, 1918, 25870, -24634, 20743, -9802, -32321, 4317, - -7535, 29519, -1219, 12026, -27494, 12655, -10349, 17642, -22587, 25078, 12545, -14131, 15740, 20920, - 11452, 20326, 16679, -15144, -23927, 5503, 23489, -29340, 17576, 21207, -5368, -31410, -3434, 7230, - 30643, 4354, 8840, -29685, 27310, 1472, 6795, -16015, 30864, -2058, -12170, -12586, 7902, 19063, - 12163, -21764, -6810, -28157, -7105, -3084, 5431, -12086, 22948, 7395, -10361, -32419, 11469, 25143, - -13087, 19011, -8472, -15068, -25234, -21859, -19527, 31500, 5771, 15357, -434, -28376, 23690, -31210, - 11392, -24177, -28486, 32703, -11494, 9828, -1959, -6440, -7865, -30976, 5594, 4208, 26331, -3636, - -11140, 21207, 26360, -17994, 871, 2632, -16101, -11592, -1664, -29196, 17133, -18835, 12170, -23417, - -8896, 15161, 30670, 15507, -31558, 29448, 2283, 14890, -10392, 15205, -16187, -279, 19264, 21278, - 1955, 25149, 25768, 7086, 26398, -29075, 24061, -24646, -25792, 1444, 3326, -24301, 4179, 2534, - 3255, 11474, 30371, -8972, 31326, 2880, 27510, -23034, 25850, 10309, -22010, 16709, 31723, -10957, - 19159, -2915, 32385, -2269, 5846, -15698, 14668, -7369, -13326, 2503, -17410, -13486, 5663, -9499, - -25365, -19842, 27077, 27281, 17868, -1317, -5711, -30101, 22159, -10925, -10133, -25833, -32750, -18457, - -18107, 3001, 8942, 117, 2863, -14933, -13983, -5263, -29166, -13603, 8110, -21303, 13865, 23826, - 11416, -7793, 25871, -18149, -24418, 30826, 21467, 13190, 11496, -3817, -16132, -11261, 20038, 16904, - 28808, 4281, -4220, -31996, -4961, 20446, 5287, -25554, 22393, 23633, 24403, 4706, -11788, -7190, - 14886, 31407, -23465, 21776, 18167, 22757, -20400, 26031, 32390, 31856, 3576, -12516, 17156, 10629, - -3918, 23755, 28538, 22405, 25184, -20740, -28106, -15602, -26769, 755, -24708, -7259, 27349, 9707, - 21673, 31600, 19622, -15275, -23797, 31326, -22315, 32471, 22542, 30749, 24059, 6870, 4469, -20768, - -4883, -32153, -24755, -8585, -1510, -26059, -11153, -12061, -6672, -17683, -18298, -28429, 31324, -24539, - 31052, -21762, -15559, 14467, -1909, 8126, -4183, -8783, -1199, 6854, -13348, 17559, 14689, -12892, - -29345, 19347, 5858, -709, 8643, 12796, -9749, 16968, 20629, 6645, -21455, 31714, -14553, 15935, - -1039, 20731, 16276, -16213, -2714, 1778, 7515, -5571, 16582, -17133, -4039, 17884, -16452, 14193, - -4853, -22996, 8004, -2235, -21896, 15092, -15921, 30644, -28152, 10376, 17977, -32258, -31880, -23752, - 24131, -26072, 27856, 1344, 9336, -17001, 8094, 26966, 31329, -30177, 17720, 31222, 4406, -18017, - -6579, 3395, -9271, 10562, 26473, -25646, -23358, -8123, -17997, 21484, -12411, -29374, -20061, 10434, - 29834, 352, 10292, -27693, -16905, 1540, 17246, 4120, -9113, -30194, 24265, -12928, -16507, -25007, - -5404, -26342, -14594, 12819, -30499, 28561, 18371, 12498, -6837, 18312, -8866, 11362, -12561, 4622, - -187, 12547, -20560, -27582, 16196, -4331, -30707, -31000, -31110, -13138, 16045, 29726, -839, -11821, - 24610, -21213, 20620, 18963, -21130, -14925, 4688, -20014, -11352, 14660, 2326, -4022, -12546, -25743, - -4471, 29204, 27479, 12135, -14369, 14017, 15578, 4049, 3551, -5122, 10452, -1870, 13132, 30799, - -4759, -11826, -22603, -25165, -22002, -13513, 27492, 19058, 9697, 8954, -3467, 193, 10352, 2048, - -10399, 22328, -28658, -11685, -12664, -31960, -26802, -23520, -25101, -27586, 29727, 23290, -8258, 8922, - -11703, 26296, 1226, -11929, -14920, -30559, -15715, -3558, 267, 30958, 19688, 28380, 26274, 4027, - 29199, 9730, 11754, 19481, -23740, -29050, -28954, 18701, 32512, 11960, -28923, -5713, 7165, 32103, - 21356, 28465, -1466, 21626, 25467, 19882, -9960, -528, -10150, -28044, -29748, -28252, 10955, 2369, - 7879, -3226, 28376, -29634, 15456, 4205, 5852, 16966, -21448, 23231, -32430, -27983, -18430, -5485, - 29361, 25338, -31555, 26654, 864, -18890, -27810, 27168, -16155, -12683, -16487, 26341, -28363, 10787, - 13059, 28570, 9464, 11025, 15458, 28063, -30128, -24020, -14339, -4307, -6671, 23996, -25503, 22454, - -1018, 25115, -25563, 25098, -793, -546, -342, 30690, -27368, 22752, -11767, 15603, -15853, -30091, - -8658, -13671, -12607, 16512, 5932, 24117, -1865, -21760, -5591, -8244, -5272, 17703, 10724, -18353, - -24335, 32107, -32102, -16706, 2197, -11531, -2219, -14062, 26880, -4424, 8239, 9979, 23469, 2746, - -19493, 13847, 23463, -10550, -21154, 13774, 20582, -7586, -31229, 14771, -24841, -2418, 11783, 16161, - 175, -25953, -4742, -20460, 18681, -3699, 14631, 13994, 19036, 26538, -24714, -11024, 31463, -9840, - -4054, 21876, 3198, 26613, -29154, -4018, -1671, 10974, 28100, 13462, -20843, 24528, -15936, -13563, - -1905, -31243, 13922, 30362, 7354, -5287, -11882, 498, 28136, -30804, -26847, -11846, 18836, -10571, - -8662, -8744, -21404, -22530, 9546, -6429, 27401, 21788, -25863, -29280, -4439, -30697, -1442, 13344, - 31030, -25809, -16989, -4069, -31942, 28673, 16125, 634, -2817, -12235, 20714, 14271, 8685, -3173, - 13209, -11886, -27231, 13768, 30066, 8, -427, -26669, -2067, -3386, 16859, -29982, -489, 18130, - -26432, 908, 1310, 10655, -26934, -10449, -8582, -2062, -32012, -4668, 3979, -30778, -8749, -26676, - -11439, 13690, -2235, -11394, -18167, -7342, 23306, 15878, 13741, -16400, -12770, -10256, 25715, -14623, - -11486, 1693, 17699, 17392, 20449, -11771, 31500, 4756, -30075, -12926, 4875, -29787, 22752, -32065, - -31422, -26421, 23663, -10498, 20542, 10202, 4133, 3864, -1395, -26279, -25451, -16857, 24659, 3663, - -17650, 32374, 22511, 21365, -16467, 13560, 30415, -19847, 22550, -18184, -13693, 26439, 25800, 2328, - -5231, -18071, 30105, -22940, 26824, -14549, 31524, 17004, 6677, 24798, 30259, 2832, 14136, 19838, - 4558, -15650, 28675, 10007, -21706, -18771, 577, -7009, -18453, -5336, -12519, -28319, -24534, -1985, - -31341, 16677, 476, 28336, -24905, -11401, -1571, -30902, 11050, -15273, 18314, -29299, -370, 6167, - -17755, 10415, -27013, -16605, 29248, 15263, -13509, 29843, -21340, 8853, 28490, -23638, 4072, -19356, - -32446, -2234, -1051, -23636, 12510, 25938, 29231, 31834, -4709, -10939, 12033, -461, 302, 19810, - -5834, -15228, -26947, -5515, 7568, -19281, 1279, -10604, -22997, -8343, 30225, 60, 4269, -20862, - 1352, -27306, -23412, -21215, 12539, 20621, 2984, 29689, 17041, 22940, -10387, -3189, -14495, 27678, - 12194, -29373, -16195, 3059, -2266, -1549, 28173, -32630, 5247, -1812, -17675, 2909, -3211, 21991, - -29227, -24891, -6359, -21476, 8462, -25564, 19277, -7073, -17328, 12092, -24095, -25335, 9166, -26594, - 21219, 4963, 18237, 5628, 24467, -24342, 4380, -17028, 26341, -15934, -32507, -21871, 22046, -24640, - 13920, 27570, -15059, 6303, 14016, 11750, 7529, -11159, 31305, -19483, -12596, -19606, -25333, -6189, - -2824, 3616, -12573, -21774, -18256, 13437, -3760, -18062, -16820, -31093, -14725, -7035, -8721, -27558, - 26249, -28108, 6672, -9738, -24292, 15586, -19294, 603, 164, 28831, -25791, -425, -18667, -10280, - -10306, 8502, 4215, -10412, -9949, -31857, -6045, -5825, -10550, 6290, 7315, 886, -24118, 10851, - 15527, 4416, 23761, -6415, -28975, 1440, 9787, -26620, 25372, 2884, 22544, 13398, 18563, -9521, - -31917, 12278, 10650, 27777, 11798, -3072, 24362, 24799, -19081, -6266, 22119, 17689, -19748, 28852, - 25939, 9124, -991, -24188, -29245, -30059, -25358, -24639, 17250, 22356, 21720, 31593, -834, -32062, - 19054, -12740, -27144, -24173, -26108, 3050, 6749, 22141, 6082, -452, 12780, 25247, -24667, 17046, - 3537, 9649, -20533, 8678, -14043, 26309, -23649, -3914, 16501, 1292, -6615, -4633, 6929, 31984, - 28308, 32681, 21213, -22711, 25281, 2290, 13440, 25361, 6236, -30406, -31761, 32753, -19945, 6488, - 4615, -2565, 3986, -10087, 9357, 27813, 28578, 9884, 2234, -3950, 32478, 15700, 19272, -26169, - 30612, 23925, -16516, -23160, -625, 9929, 22118, -25799, 24137, 24483, 18677, 22776, 6223, 31650, - -16376, 4032, -4054, -28108, -349, -5428, -23552, 13852, -21311, -23456, -26736, -12577, -810, -18534, - -10166, 9230, 28025, -18598, -14088, 22790, 30362, -5973, -25788, -4274, 32300, 17547, -10635, -2142, - -5485, -25818, 17885, 10388, -29688, 11389, -7122, 20010, -31329, 19984, -13665, -2581, 22584, -9733, - 24223, 1082, 5546, -11626, -24266, -27988, -26989, -17744, 9498, -469, 11641, -12540, -5369, 2733, - 11799, -4030, -14972, 22487, -30418, 13676, -31575, -28520, 23891, 7151, 21544, 21923, 21541, -1527, - 28827, -6627, -19364, -7841, -26964, 8554, 27524, 7097, -17265, 28729, -3040, -9605, 7406, 20877, - -10820, -5560, -29220, 9648, -2518, 9649, -32077, 13975, -19788, 20385, -24157, -17874, -23418, 13109, - -17114, 6587, 12982, -15308, -11645, 2826, -2179, -28192, -6120, 13937, -1637, 6132, -28361, 32574, - -10224, -20957, -12511, 13681, -24701, -9631, -23557, -22110, -4017, 19396, -19268, 17951, -22950, -29661, - 5630, 19036, 19870, 3374, -22652, 6963, 11591, 16172, 29406, -11912, 19195, 1211, 8275, -28201, - -30326, -9150, -32225, -3000, 14266, -17050, -10103, -11707, 31198, 30799, 17358, 8840, -4033, 8772, - 25288, -6383, -28709, -17937, 477, 19343, 16371, 32143, -9136, -4108, -9866, 6533, -28512, 31239, - 2241, 26871, 27915, 11284, 13150, -29010, -15268, -18478, 6980, 17469, -13106, 21325, -2681, -11604, - 12483, 22161, 31400, -26852, 16186, -19560, 630, -27201, -22228, -27123, -296, -13502, 4374, 12336, - -24561, 26351, 17327, -18796, 22408, -28241, 3788, 25864, -19990, 13793, 14321, 30226, 10872, 15538, - 5605, 17989, 8703, -6566, 12540, 15228, -1355, 29840, -9904, 28316, 24154, -8374, 28259, -14089, - 22964, -20044, 14429, -8523, -6141, 3110, 5365, -9560, 20793, -18929, -14933, 18092, 11521, 22912, - -4585, -35, -4436, 20796, 17243, 32523, -26894, -27116, 11336, -9117, 24124, 4275, 979, 14150, - 12054, 7658, -9591, 24969, 18790, 19167, -28992, -20146, 14628, 10103, -14117, 4019, 14195, 25112, - 18543, -17336, -9079, -31418, -11524, -16712, 24452, 22088, -5348, 30749, -23623, -8490, 31265, -15985, - 11118, 17514, 2439, 13465, 26181, -21117, -11034, -246, -2460, -12205, 11833, -13289, -2691, -13927, - -28892, 7953, 12291, 11901, -6926, -13970, 25004, -4838, 27127, 5179, 19608, 30929, -27725, -8725, - 11595, 31796, 6222, -5014, 32045, 21695, -4569, 422, -4140, -28054, -4824, -11643, 30040, -10167, - 7184, -17022, 952, -22156, 31595, -22585, -14233, -1093, -4865, -4607, -25975, -16708, 7906, 5558, - -23374, -11619, -7796, 12307, -28582, 26894, 30041, -347, 29538, -29995, -19643, 12096, 1237, 6036, - -28332, 28557, -5376, 8490, 26787, -23869, 6887, -29633, 4933, 403, 23097, 14211, 18977, -9365, - -20111, 23629, 32495, 14504, 2404, -5159, -9900, 32528, 23829, 30484, -17146, 29703, 16475, -5371, - 26444, -2606, -29224, 29605, 4166, 22461, 20301, 25517, 19392, 15571, 19191, 22066, -29298, 21309, - 13437, -27214, 5076, 20167, 6689, 30561, -13651, -19951, -18519, 30460, -13182, -27673, -5619, 23716, - -15434, 9989, 30497, -28482, -4582, 10992, -9861, 29860, 17144, -28584, -16649, -26006, 15141, 16834, - 21184, -18363, -28928, 11477, -10909, -22379, 3660, 1804, -23282, -30518, 22134, 26094, -4773, 2740, - -26319, 12875, -31136, -13639, 11968, 1491, 22322, -3475, 28132, -32457, -21024, -9634, -21229, 27003, - -18327, -4383, 26729, 2406, -4941, -14334, -13280, -28434, 12815, -6286, 12038, 26739, 24780, 5419, - 16994, 22678, -24592, 23226, -32169, -19826, -24592, 7498, 14056, 18175, 19003, 12178, 30193, -10542, - 6581, -19584, -14966, 17938, 751, 24061, -11967, 19013, -12066, -989, -25752, 25794, 27267, -26961, - -24882, -24662, -13422, -3436, -8088, -7481, -16493, 8129, 1556, -24171, -25412, 7142, -6187, 23498, - -12549, -1980, -787, 20999, 5777, -29561, -14320, 32107, -23823, 20225, -32707, 14963, 22746, 9981, - 15606, 21580, 17280, 15577, -26959, -5490, 19941, -6947, -22368, -2015, 1123, -4205, 23535, 19190, - -26466, -25615, 6405, -29248, -30191, 27541, 25418, 8000, -26994, -2799, 22962, 2672, -5408, 17462, - 16175, 10732, -9615, -21399, -13460, 17357, -21076, 11291, 25071, 29971, -14705, 30867, -20082, -19711, - -16153, 19907, -211, -7626, 14408, -14970, 2547, 15846, -11734, 26269, -3410, -19005, 20220, -8831, - 8336, 21047, -25526, -24650, -16189, 9510, 24794, 2775, -25629, -5491, 7145, 28068, 25891, 32509, - -22455, -5223, -850, 11899, 10880, -27904, 28204, -9749, -24801, 1906, 26866, -12913, 20511, 9763, - -13029, 11153, 5311, 573, 24869, -14803, 20713, -8282, 5145, 31541, -18269, 9994, -29281, -7157, - -28134, 19746, 1832, 8775, -26749, 28711, 13994, -741, 17553, 22329, 21437, 11197, 24059, -12516, - -16532, -11469, 460, -8513, -21897, 22407, -9388, 1188, 9641, 17062, 10637, 26436, 1351, 21066, - -32628, 30636, -14874, -5160, -14131, -16869, -25062, -23400, -115, 1269, -23397, 14052, -32583, 29274, - 5448, 9706, -8210, 17980, -7843, 10201, 31792, -26702, -19423, 1612, -6290, 20160, 15930, 19511, - -16658, -5089, -18930, 17766, -6131, 3061, -21988, -31794, 20896, 11924, -14867, -21108, -23771, 22469, - 11979, -18441, -9237, 7334, -1569, -21721, 16901, -1091, 31723, -19976, 16034, -7755, -31487, -6646, - 6996, 14706, 19922, 22615, 14898, 1434, 10712, -28903, 32704, 14897, 2120, 8218, -25760, -6943, - -23935, -29634, -5671, 27736, -21071, 4440, -24176, -22135, 3047, -3114, 5553, -19945, -10422, 4640, - 7462, 25499, 11005, 9077, 24840, -725, -27792, -32719, 23901, -6175, 31762, -16663, -10836, 11482, - -31644, 24967, -23692, 8051, -15098, 7049, 1335, 20343, 4452, -3168, -21433, -23521, -28891, -20041, - -17456, -24122, -12780, -27267, -20072, -9967, 21563, 7901, -15288, -22542, 9349, -3627, -28029, -16277, - 9352, -29088, 21489, 30857, 28132, -16366, 26135, 19625, -10106, 15867, 10363, 15350, 17907, 17132, - 538, 5084, -21368, 13213, -25691, -6591, 30657, 3512, 22271, 22189, 16615, 4824, 23227, 3794, - -6100, 4593, -2504, -16130, -20737, -14769, -22245, 2680, 2459, 25398, 13477, 7151, 7584, -20, - -1134, -31778, -22065, 1568, -18937, -9886, 29011, -6375, 26935, 6741, 28748, -16147, 9266, -1898, - -30341, -3534, 8211, 1167, 19650, -24358, -27934, 24141, -3826, -14162, -24359, 14235, -16244, -8189, - -20506, -21544, -20332, 3761, -8115, 11537, -242, -27423, -7281, -14062, -23074, 15367, 22682, -27688, - -28790, -15022, -4348, 24771, 31035, -20220, 18493, 24762, 9185, 27134, -27021, 1473, 6923, 32245, - 10831, -1903, -2837, -18312, 5600, 5465, -21392, -3534, 29593, -13393, 21464, -5510, -539, -3123, - 25521, -28948, 5344, -28047, 11129, 19429, -12931, 14244, -4487, 26285, -6782, 4820, -20385, 21197, - 20065, 30346, -17762, -11578, -20164, 1759, 13128, -24607, -23838, 21577, 19883, -30973, 23749, -4786, - 18334, 29802, -28476, -26125, -3890, 30261, 32627, -24184, -28121, 9488, -18653, 7630, -15792, -7534, - -27270, -7543, -22576, -26217, 15412, 22226, -24981, -22036, -18380, -28028, -6175, -2612, 685, 12343, - -16357, -3892, -31768, -24103, 18691, -14658, -30425, -18996, 13071, -16293, 23787, -23171, 9261, 17302, - 18095, 13130, -28163, 22864, -16268, 24829, -15882, -10375, 3936, -14845, 2668, 25975, -2613, 7589, - -32761, -19648, -22042, -30162, 5244, 15474, -25747, -16760, -4950, -20851, -2890, -11381, 14818, 2489, - 10913, -25242, -1443, 18657, 7362, 31773, -31390, -22896, -12316, -9537, -3539, -25723, -25350, 20224, - -4805, -1354, 12548, -31157, 14305, -4796, -2975, -631, 10363, 7949, 16899, 6382, 1484, -17252, - 30446, 29834, 26824, 5843, -21706, 770, 9582, 21651, -16820, 4502, 29540, 26959, -4765, 363, - 14122, 11212, 20808, 13243, -22876, -5588, -8512, 10022, -8007, 30288, -12330, 10389, -29121, 15057, - 18959, -23530, -4996, 9536, 3837, -19161, 9518, -2319, 1492, -32557, -15245, -31963, 25970, 8426, - 6944, 9821, 9837, -26636, 11757, 18191, -4359, 7970, -31350, 15984, -14027, -27250, 29048, -2633, - -8292, -31384, -31597, -8296, -24044, -21151, 18426, 12781, -16808, 934, -21193, 30446, 5928, -11648, - -30245, 5585, 27549, -15259, 21955, -9503, 19797, 13257, -10968, -6577, 21179, -31487, -12382, 10008, - 11352, -15322, 26680, -14145, 25397, -7664, 16195, -9939, -13674, 5413, 19877, -25896, -27647, -18255, - -13667, -29063, 787, -28751, -1670, 10736, -22093, 14459, -19674, 17238, -574, 30548, 6187, -6796, - -31900, -20342, 28906, -26999, -13071, -3882, -24693, -10363, 18675, -15412, -14553, 31630, 20382, 22033, - 28980, -31654, -19859, -8413, 11928, -5251, -8375, -26339, -20974, 22722, 20109, 9674, 4946, -26645, - -16776, 29735, 566, 434, 32672, -21727, 28291, -26541, -18939, 17703, 28449, 11490, 23793, -28702, - -29226, -31601, -22513, 16046, 27105, -7956, 2232, 13998, -7790, 21930, 1497, -24904, 11612, -29955, - 2533, 22249, 12823, -17860, 2813, 17784, -1115, 10082, -30416, 5777, 8457, 13972, 18808, -15649, - -4419, -6407, -3994, 28546, -6385, -15764, 17782, 18765, 24049, 12178, -17396, 17309, -16062, -21680, - 8266, -32179, -8969, 22654, -1958, -12483, -4898, 4843, 17598, -12183, -25810, -31492, -11763, 28290, - 15246, -7875, 9053, 27921, -23005, -31229, 23886, -10400, -32001, 14747, 13197, -12273, -29798, -4818, - 5134, 24972, 24974, -1065, -19985, 1791, 5699, 6214, -21967, 19024, -2627, -17475, 885, -30144, - 12785, -15970, 31109, 29344, -19391, -17998, 15693, 17997, -32210, -3946, -8069, 18033, -7191, -11758, - 25115, -16474, -28175, -15232, -4082, -20743, 22646, -11980, 2154, 6576, 21218, -18296, -2751, 16461, - -733, 21243, 3196, 16327, 32705, 7853, 10609, -6441, -26977, 11918, -18090, 14457, -18166, 17306, - -17312, 1286, 13245, -10741, 3656, -1991, -12697, -18780, 32346, 14958, 5425, -29371, -1397, -25919, - 13024, -22333, 27045, -24855, 16986, 8566, -10859, 3344, 20280, 4995, -11505, 21622, 1326, 4215, - -22984, 19146, -24505, 30551, -375, -18678, 21195, 7246, 5075, -4489, -2555, -6376, 7351, -21775, - 31110, -19283, 6272, 17344, 8043, 4434, 13129, 20560, -32068, 28145, -4707, -31705, 21724, -17745, - 28962, 16757, -1806, -27463, -26123, 21806, 16415, 30425, 27139, -13166, -19432, 6490, -1012, -15885, - 13869, 18823, -9542, -463, -31195, -5268, -6134, -24904, -22798, 2938, -16549, -749, -25077, -1722, - 27220, 1832, -19303, 30128, 30266, 15483, -27451, 16285, 10992, 5643, -23697, 31568, -5057, 350, - -28980, 15862, 29846, -12342, 5265, 28420, -22275, 2494, 11281, -20582, 14528, -790, 19478, -13066, - -11716, 11577, -19832, 2644, -27218, -8099, 19448, -1466, -13755, 18893, 14833, -15657, -596, -27682, - -10983, -14952, -2954, 23140, -23517, -23391, -1723, -16670, 23238, 2088, 29532, -15046, 14784, 15485, - -30739, -23112, 32516, -1604, -17138, 18881, 31337, -4660, 22836, -23036, 578, 24832, 1490, -9010, - -26528, -10283, -1637, -26772, 901, 31401, -32468, -26152, -15799, 32497, -17011, 9978, 2212, 29158, - 256, 27867, 16854, 19419, -24387, -5007, -26719, -16365, -30878, 19482, -6286, 3807, 18594, -17089, - 6477, -30007, 476, -13378, -7850, 29158, -17735, 20484, -11783, 20234, -7491, 11036, -4265, -5715, - -22219, 31453, 1611, 14404, -4308, 2765, 3323, -11994, 2266, 30683, -27588, 1952, -4025, -22680, - -17913, -11630, -22289, 18545, -25548, 8129, -21514, -3741, 10082, 3031, -25247, 13922, 23482, -13226, - 12832, -21159, -28423, -15307, 19755, -22487, 15241, 30810, 27593, 1515, -20144, 13889, 18912, 3945, - -5056, 20325, -5362, -28978, -26863, 17651, 7684, -15647, 21428, -21289, -17148, 15228, -19942, -13968, - -1795, 13676, -8622, 26967, -32561, -13109, 7829, 26502, -15004, 4667, -30441, 27715, 19639, -8337, - -3700, -26857, -11572, -28924, 6148, -30042, -8549, -3253, -15787, -3963, 4976, 30398, -31653, -27710, - 18347, 22240, 11943, -17600, -22007, 26258, 6636, -19034, 26725, 21375, 20072, -20916, 19271, 549, - -5355, 22809, 15809, 14991, 20253, 9029, -29202, 7707, 16408, -7273, 16915, -21012, 14000, 22628, - -18760, -4609, 7221, -15166, -8127, -32282, 32749, -20359, -28527, 8812, 13167, -31750, -16474, 21321, - -7974, 20985, 418, -224, -25734, -17311, 8193, 24739, 19729, 11916, 5599, -21653, -29462, 8509, - -3499, -29843, 31578, 13256, -31895, 14263, 18319, -22418, -22686, -1215, 14264, -23286, 16913, -14113, - 29769, -5456, -28881, 4301, -21957, 20673, 20096, -15789, -15289, 4889, -18162, 12537, -27629, 31719, - -17318, 21250, 6461, 27858, 20417, -27357, 30116, 20690, 16548, 27281, -3166, 1858, 5007, 31739, - -5948, -18471, 9899, -19278, -25533, 12489, 25797, 22950, -26786, -14528, -23887, -24563, -19359, 17268, - -26473, -10665, 3893, -12457, -11770, -1939, -4716, -2706, 8528, 31342, 29993, -18895, 4860, 5344, - -5153, -30146, 26056, -12406, 8285, 30231, 30601, -19051, -834, -8478, 29159, 15637, -31500, -17531, - -6226, 27343, -19576, 22787, 30379, 6383, 21715, 825, -18263, -31727, 5900, 13587, -13653, 28592, - -25564, 3176, 18525, -19205, 31920, -1808, -9676, -10013, 21852, 32546, 22500, -8388, 159, 19536, - -5691, -12194, -12732, 8717, -17825, -22573, -9836, 20944, 10360, 32408, -21333, 29305, -30567, -7678, - 2227, -16573, -9573, -23428, 2579, -1131, -27778, 12551, 3340, -23117, -2183, -6855, -16901, -26001, - -12085, -21840, 29510, -24398, -28283, -19088, -23189, 23717, 9748, -28561, -13352, -32711, -10452, 22246, - 6393, -17178, 26871, 23447, -18772, 19180, 20620, -12248, -21424, -9058, 2703, 22654, -31542, -16692, - -26838, 21598, -16928, -23519, -3372, 5419, -16774, -22309, 29308, 29548, 5190, 20746, -14693, 30094, - 19751, 1755, -8421, 26605, -17133, -18325, 22791, -3621, 8342, 20549, 7868, -25619, 11776, -5155, - 20187, -19558, 16385, -8304, 30941, 17386, 12276, -23802, -14544, -17146, 27048, 6373, 29562, 2156, - 29009, 6411, 10072, 10239, 14842, 28257, -29553, -25318, -24526, -23449, 8171, -2858, 12348, 4333, - -12900, -26650, -13593, 24579, 18447, 23385, -741, -32523, 10049, -31103, -3074, 3969, 25747, 23616, - 19100, -31900, -32639, 1557, 29141, 29205, 27675, 19468, -2916, -20117, 4007, 6715, 3226, -22572, - -16790, -8816, 4733, -23704, -2004, 10979, 28249, 8610, 32263, -17140, -16755, 21521, 2197, -11599, - 5548, -25841, 24458, -20991, -26241, 7401, 9834, -8332, 18734, -15867, -27326, 1587, 20663, -21445, - 17024, -4001, -9228, 27101, 5728, 31712, 20202, -20046, 30321, -10236, -9011, 10495, 2852, -22417, - 10997, 6856, 27999, 13488, 14224, 18196, -3460, 955, -2120, -32134, 19517, 25792, -22806, 32634, - -26417, 14979, 15799, 13050, -21535, -17998, 15, -30412, 17191, 18416, 1437, 16867, 21682, -19525, - -12251, 22287, -22540, 27567, 3145, 4899, -22513, 6886, -13237, -15375, 4830, 21805, -22459, 30316, - 25796, -26146, -19685, 18435, 5554, 26037, -18401, 7946, -5399, 22550, 28835, 10698, -10124, 24962, - -16168, -13085, 3534, -437, 490, 17301, 18047, 8974, -23870, -5204, -31414, -32063, 18547, 24340, - -4428, -11209, -31596, 9663, -1019, -26663, -14947, 5154, -1271, 29077, 17673, -22375, -21598, 5136, - -27142, -15661, 19327, 14411, 19486, 11621, -2811, 13340, -20580, 21618, -26808, 6934, 27790, 22088, - 24198, -26707, 8059, 21218, -25087, 6121, 6359, 15085, 32134, 32219, -26526, -21334, -29595, -15282, - -6270, -10636, -4236, 31986, 13647, 24495, -22916, 18323, -8284, 8499, 19832, 31755, -4548, -23477, - -6844, -10259, 25917, 461, 15736, -7557, 31050, -30269, 4328, 162, -1870, -1338, -5574, -24895, - -14875, 12688, 14192, -15717, -20930, -29480, 13303, -32392, -29127, -19826, -19388, 8812, 13124, -748, - 31175, 13507, -21607, 132, -5341, 11742, -5354, 29989, -14252, 6355, 20951, 20270, -8537, 5397, - 26811, -19617, 32641, -39, -4972, -4275, 8803, -17187, 272, -25724, -22074, 10495, -7372, -13842, - 28470, 24248, -24058, 8312, 20475, -5539, 5319, 26831, 4683, 21258, -32077, -24112, 12659, -11123, - 10539, -481, -26774, 26462, 27049, -2398, 14658, 12955, 17804, -31417, 3884, 8872, -10944, 20671, - -8773, 21584, 6080, -1857, 414, -19782, 17372, 26808, -14838, -26642, 12330, 1011, -24114, 10953, - 21299, 13940, -11646, 17804, 1490, -13347, -21852, -1658, -20013, -6192, 2048, 28001, -10917, -10013, - 28218, -738, 21727, -10564, 3904, -23271, 25488, -9135, 7180, -3742, 15815, 12433, -27477, 24614, - -29560, -30915, 1161, 6698, -26159, 10438, 9579, 30798, -14747, 5488, -22438, 16982, 8188, -32067, - 992, 2181, 11885, -711, -2466, -4593, -22796, -8318, -15952, -22601, 26702, -32377, 19540, -816, - -30107, 26676, 16610, 11440, 735, -19209, -30083, 23474, 32702, 14767, 240, -22277, -31480, -16489, - -23514, 10713, 13800, -20162, -11625, -12349, 9136, 26476, 25989, 15296, 22772, -1346, 28296, 30035, - -27669, -23028, -3623, 16352, -11768, 29863, -211, 27662, -20931, 17394, -26616, 14089, 5660, 3302, - -5116, -30231, -709, 23097, 4137, 9646, -26870, -10135, 28259, 26671, -29544, 28156, 26354, 12559, - 16673, 9360, 22601, 24359, -29836, 19716, -4488, 486, 10472, 21666, 22658, 16687, 2210, -21810, - 30917, -7038, -12286, -8347, 111, 24821, 20390, -15647, -8380, 5558, 21118, -9031, -15847, -15543, - 4451, -14299, 15330, -26918, -26622, -20934, 20070, -31453, -15366, 17135, -15634, -18544, 6615, -11256, - -29633, 24489, -32331, -16068, -5204, -25774, 28114, 18463, -3156, -18364, -2848, 25749, -6411, -11589, - 20704, 7041, 11829, -25341, 28114, 11373, -1433, 4515, -19509, -6935, -31501, -10918, -27691, -9737, - -31430, -16530, 7470, -14204, 3232, -1860, -28699, 7331, 27598, 30740, -9656, 31371, -14006, 16730, - 17008, 19978, -28380, -28177, -4827, -14440, -5648, -10001, -3393, -28071, -30229, -19488, -30401, 7763, - 30702, -3215, 6434, -15778, 2013, -995, 2243, 21006, -5258, 14145, 4797, -7589, -24633, 12699, - -18783, 23235, -1062, 32587, 12899, -22582, 31990, -793, -14886, -6354, 22088, -26274, -29386, -14606, - 6511, -25933, 19984, -22275, 14745, -4324, -24025, 31317, 10737, 5894, -16255, -30359, -26976, 27417, - -18947, -22709, -8850, 320, 20717, 12253, 4248, 19162, 29818, 26970, -29872, 3483, 31316, -22592, - -3619, 15574, 21718, -7259, 21936, 21938, 16432, 7841, -25204, -7233, -17957, 950, 22397, -29137, - -13853, 389, -26625, -24273, 4417, 1035, -13549, 29826, -25618, 2648, -14482, -25689, -19869, 6799, - 3972, 749, -24288, -1529, 6171, 24557, -28923, 32178, -6085, 26253, -18390, -10825, -16406, -28983, - -26316, 4057, -31602, 26805, -10362, 11122, 14968, 24717, 31348, -6283, -6088, -27514, 22783, -9331, - 32118, -4790, 15058, -2285, 28272, -10703, -20233, -26304, 24876, 13411, -27444, 2592, -27310, -8900, - -13694, -8071, -9887, -26547, -2102, -17939, -20622, -18179, -24629, 6080, -14267, -13495, -30810, 8916, - -880, 26453, 23291, -21175, -31881, 9454, 9714, -32050, -16432, 32513, -17230, 6906, -8997, 28811, - 17690, 26529, -23239, -30455, 29355, -26842, -1670, 3490, 4782, 7291, 10265, -7899, -29922, -31811, - 29002, -23503, 12643, -7114, 1674, 671, 23880, -18490, -6026, 10434, 21382, 8148, 2704, 28338, - -31762, 31961, 25491, 21977, -17930, -25336, -12680, 26181, 27636, 25848, -16669, -22981, 24334, -21938, - -11071, -23653, 18771, -7284, 1187, 3141, -31018, -15562, -2519, -6991, -25547, -6651, -7414, -3065, - 28804, 2137, -20098, -7612, 6659, -15156, -30216, 15495, 32077, -18614, -8204, 2502, 6785, -21780, - -25660, 15289, -24879, 13509, 5156, 1439, 6701, 949, -16879, 9015, -19769, 14107, -25657, -30781, - 31232, -16280, -26806, -6923, -14110, 28861, 8984, 31742, 8667, 24810, -29512, -11278, 26703, -17496, - 31746, 24944, 17088, 6308, -6814, 18209, 3427, 15682, -24337, 25249, -12770, -30119, 21697, 30274, - 2392, 2861, -31346, -1260, -26590, -1807, -14141, -23322, -11081, -8519, -10316, -4433, 21885, 17243, - 21051, -30465, 4093, -18955, -30153, 10396, 17389, -30614, -14259, 10550, 31093, 2261, -4746, 8554, - 21485, -20504, -17470, 19142, 372, -27280, -28087, -24199, 7779, 28257, 12599, -23821, 7340, -18292, - 3337, 3627, -5061, -14939, -23028, -22738, -25697, 28600, 30674, 1012, -10570, -11201, -30471, 1801, - -17121, 13564, -20440, -3455, -5273, -2713, -8498, 7960, 22879, -22443, 15796, -6630, 16515, -14409, - 10388, -13336, -30274, -25968, -17714, 32137, -13058, -6867, 9172, -3138, -1112, 18639, -25494, 28152, - 18814, -21134, 272, 1515, 13155, 18752, 18220, 980, 28534, 19708, -29178, 20987, 14199, -10974, - 30958, -11327, -19457, 17667, -2113, 28146, -1203, 22769, -13659, -24395, 29005, 10659, -2012, -6726, - -23091, 26099, 19261, 14849, 29738, 21460, 12000, -18955, 30567, 100, 8141, -5493, 24728, -22556, - -13896, -23812, -13306, -8324, -2449, 11310, -26700, -29306, 27021, 20309, -24345, 13695, -30537, 22030, - 24640, 16640, 21705, -23679, -2086, 25100, 4859, 813, 19767, 8648, -30717, -16019, -17767, -1595, - 540, -510, -25483, 2300, 8733, 24273, -16027, -8274, 22942, -9315, 470, 17999, 23952, -10803, - -32442, -18807, -25590, -4115, -14192, -12632, -15070, -9202, 11476, -17536, -25700, -13880, 13583, 29720, - 4346, -17236, 12754, 10722, 31310, -24261, 32230, -14629, 19472, -26051, -13610, -26885, -23969, 18728, - 28607, 27337, 11647, -24285, 4356, 10403, 1573, -17695, 28065, 7519, -14104, 27834, -12524, -13034, - -31714, 5090, 21014, -9697, 15918, 3915, 27212, -7628, -14423, 32483, -1847, -15930, -15586, -6558, - -18515, -23201, 4083, 5980, 713, -22026, 19906, -17364, -32361, -18474, 1397, -26682, -13170, 754, - -20046, -4934, 270, 31119, -16576, -32629, -12164, 6939, -22327, -10178, -653, 930, -11057, -20059, - 12938, -14314, -4536, -25456, 13910, 860, -29303, -10880, 9696, -12062, 21220, 22519, 18392, 4181, - 7514, -26464, 14478, -17960, 5853, -19812, 31657, 9232, 25658, 29596, -1761, -18967, -23488, -10833, - -6106, 32187, -23524, -14674, -30985, 9276, 19314, 19943, -9286, 7855, 12659, 12211, -32332, -8174, - 19083, 30645, 4636, -30053, -13189, -17538, -18910, 10856, 20465, 29243, -11664, 14820, -23933, -23754, - -14125, 29551, 9521, -3564, 22762, -24494, -16540, 7168, 11506, 31326, -16597, -3584, -14775, -21436, - -24978, -3023, -17250, -20069, 27899, -26517, 12839, -14651, -3597, -19259, 24204, -16843, -3203, -18380, - 10597, -20561, -4370, -5895, -8085, 29908, -27333, 4657, 19384, 27254, -22872, 321, 32392, 28227, - 4516, -10726, -18010, -26069, -31721, -26145, 28942, 9978, 20523, -23812, -17273, 26703, -943, -1150, - 23011, -9135, 30759, -7177, -3091, -21409, -25679, -20684, 21430, -27603, -22801, 13125, 26897, -10061, - 32013, 8841, 2273, -5075, 8206, -7036, 14563, 10589, 16462, 30985, -12022, 18995, 2365, 16871, - 31869, -7584, -2697, -31770, 16153, 28108, 23916, -151, 713, 2432, -23120, -12781, 22484, -32191, - -5669, -32695, 22663, 19890, -29109, -25427, -23986, 13955, 10100, 11714, 8822, -32591, -23562, 961, - 4633, 15999, 4938, -10036, 18286, -4961, 23994, -1350, -29666, -12520, -17992, -32617, 14307, 23366, - -22999, -534, 9277, 32440, -16817, 16784, -31578, -15286, -31657, 30448, -4858, -9023, 21924, -28444, - -18055, -26602, -27938, 14465, 22144, 19846, -22396, 4085, 1077, -1874, 3244, -8931, -20038, -8457, - 5011, 26771, 11296, -23151, -23189, 11273, -28716, -6577, -10716, -6779, 15759, -4884, -6147, -24827, - -22326, -1603, 26859, 14330, 8767, 18296, -3191, 9137, -15494, 18505, 20533, 28157, -6181, 28860, - 6154, 7541, 21273, -29515, -30077, -25091, 31643, 27294, -19697, 12623, -23540, 4421, 26299, 6517, - -647, -1517, -20373, -5126, 1592, -2084, -23190, -9129, 31698, 25093, 5976, -28177, 28716, -19241, - 10317, 3251, -25625, -31294, 18307, 18445, 9567, 24487, -4115, -16900, -32027, -19005, 141, -8244, - -18408, 3932, 31042, -30661, -15246, 2674, -28089, -1347, -15379, 14444, 24810, 24830, -16195, -20752, - -26093, -28129, -30499, -24732, -13909, -4473, 13848, 23473, -21199, 29581, 7968, 2186, -20360, 15564, - -15113, 11520, 26051, 5716, 16246, 11565, 31234, 1793, -3180, -15847, 32185, 29230, 24361, 16114, - -1014, -9327, -13272, -15295, -10535, 6563, -10573, -10325, 23601, 21833, 13039, -24083, -29289, 13155, - -4956, -37, -2375, 17015, 31507, 19561, -6328, -24872, -8496, -2299, -23401, 12155, -21818, -30931, - -17541, -7876, -24249, -14854, 10128, 24187, -25843, 15694, -19058, -26247, -8720, 17535, 8267, -18887, - -18659, -10015, -15686, 16721, -23845, 23908, 23297, 6173, 14922, 10756, -24271, -14917, 18057, 6405, - 16469, -8407, 19495, -7238, -9267, 13319, 22108, -4618, 18984, -29761, -18142, -5589, -11490, 26579, - -28385, 22568, -27243, -20434, -14536, -11999, -27196, -13301, 25413, -22097, 451, 19363, 29274, -6682, - 18095, 28483, -28595, -19179, 19843, -9072, -7431, 6112, 14638, -30009, 24465, 6547, 30307, -14561, - 28159, -31184, 1023, 25814, -13501, -4198, 22347, 655, -15243, -29898, 16948, -23585, 7953, -4006, - 23384, 4507, -29151, 1520, 30802, -32677, 6522, -5694, -6005, 15226, 26805, 12763, 17320, -17659, - 10358, 7663, 11628, -6802, -4473, 6095, 18004, 24038, 17207, -10296, -25058, -7953, -27864, -18548, - -17509, 26843, 7792, -15219, 24332, -29740, -23937, 22338, 20265, 7096, -2883, 11356, -31773, 24201, - 29844, -20123, -8804, -14104, 12112, -25763, 27108, -4160, 32095, 5964, 3742, -15727, 13813, 4610, - 20491, 17581, -15063, 27679, -26183, 4256, 8442, 9570, 31279, 28586, 11692, -1621, 7890, -28652, - 25471, 25328, 18392, 2075, 30578, 23803, -11000, -29524, -8733, -27167, -11813, 19255, -15419, -8495, - -8324, 30800, 31706, -14566, -13081, 24398, -24084, -25903, -15883, -31068, -8708, 6131, 8395, -4528, - 12258, -138, 7395, 31963, 17479, 7763, -4544, 21008, -28798, -31986, 14890, 961, 10781, 24195, - 26453, 25746, -24227, -22234, 32017, 31018, 4066, 18189, -3206, 7905, -24136, -12876, 16252, 3641, - 22714, -25196, -154, 23525, 27082, -12904, 22324, 27370, 23607, -4218, -9083, 29280, -31541, 11882, - 16846, 9707, -30804, -30329, 27174, -9626, 19303, -9032, 15445, 18198, -20815, -9339, -24706, 31661, - 28719, -28228, 26319, 5342, -5599, -20866, -31757, -19742, 15524, 9184, 18988, -8982, 16530, -8406, - -21147, 16215, -5545, 19938, -21850, -17862, 26836, -24736, -22561, -21248, 16269, 19612, 29222, 13723, - -8164, 9946, 8503, -20526, 28551, 23050, -23180, -9756, 31389, 3910, 30496, 29654, -13061, -3366, - -25776, -277, 23556, -7798, -30089, -1408, -20334, -2330, -4300, -6375, 23259, 16023, 25906, -15297, - -13467, 21049, -9044, 26007, 25621, 5437, 14806, 18259, -7, -11221, -27824, 5684, -3504, -2441, - -11161, 22771, 22277, -15882, 1554, -1930, 6085, -29238, 153, -606, 12592, -16239, 28966, 10844, - 7519, 1065, -30102, 10104, 12248, 26217, 25322, 11666, -16304, 29319, 30762, 23248, 29020, -20060, - 20534, -13493, -17450, 22214, -25456, 15426, -6882, -15711, 15146, 15695, -12057, 32166, -7035, 13018, - -30090, -26097, -13402, 2081, -9773, 29477, 13815, 2758, -9456, 20301, 5129, 5412, -32449, 20104, - -26053, -17757, -6691, 20694, 10982, -30925, 27503, 20871, 12087, 6037, -31177, 32691, -24580, 29657, - -8604, 13437, 20597, 24681, 14757, 30601, -18387, 30526, 2182, 28644, 16859, -20130, 4335, -20796, - -13213, 9675, 7460, 13975, 3468, 9366, -30876, 18345, -26420, -30875, -31325, 13994, -32702, -13026, - 9162, 20136, -26642, 27441, 2486, -22883, 30918, -18396, 8041, 17407, -7186, 14600, 20137, 20238, - -26763, 1011, -6075, -27924, -4461, 19109, -27073, 20715, 27629, -9680, -15277, -6493, -7625, -516, - -2540, 2682, -22826, -31768, 9769, -17863, -26210, -4062, 8733, 15616, -26149, -27876, 30675, 2194, - 25003, -6702, -15774, -593, -9831, 25867, -31614, -1681, 30185, 20804, 4865, 4885, -31145, -8960, - 25811, -16957, -25059, 10006, 30070, -27271, -12081, 16056, 32484, 1521, 7257, 503, 6951, 12680, - -17056, -12971, 2765, 6414, 12115, -6022, 30494, -20934, -32027, -5536, 10526, 15056, -13914, -27407, - -29215, -30531, 12042, 25402, -28482, 7738, 27774, 9992, 20202, -12835, -30625, -24426, 12654, -4112, - 14256, -31933, 23217, 22906, -28983, 22301, -13823, 21052, -18452, -28316, 21386, 18065, 27168, 7607, - 19404, -4999, 24294, 18767, -3077, 18245, -14590, -20042, 32667, -25889, 28098, -13659, 1457, 30152, - 5406, 7314, 9118, 1436, 2239, 7458, -11951, -13835, -27789, -24030, -21847, 25462, 10470, -6869, - -21809, 31632, -24597, 28674, 26418, -21964, -22761, -10092, -27414, -29677, 13886, 18813, -28419, -24019, - -3030, -14182, -11805, -1860, -12395, 27709, -31922, 29009, 1965, 32402, 2316, 10966, 11692, 13785, - -18267, -10067, 13116, -30822, -29668, -17732, -13091, 25692, 18837, -14333, 11288, 27389, -9154, -4882, - 31243, 15407, -7061, -13573, 3355, -16847, 21411, -29240, -8721, 10859, -12169, 16002, 9502, 31100, - -27711, -4604, 28380, 31803, 4994, -11019, -19478, 9970, -31026, 27438, 2720, 23420, -11448, -26757, - 31799, 5031, 31064, -22827, 10687, -31821, 4086, 8040, -4028, 23433, -32527, -20875, -7831, 29953, - 8, -29995, -26822, 32748, 2031, 10540, 12823, 26529, 7272, 6884, -32146, -19135, 19748, -12003, - -299, -617, -32021, 8076, -3004, -23398, -20696, -28424, 407, 3176, 15183, 31338, -20488, -32116, - 17431, 2520, 1858, 10268, 21499, -8580, -6212, -11595, -8199, -19186, -2511, -25146, -19028, 20615, - 4650, -14332, 30279, -31819, 14911, 8753, -19426, -1500, 10684, 20295, -15926, -32646, -23997, 24732, - -7790, -10906, 23164, -7297, -19071, -24972, 12053, 29542, -14014, -10358, 28361, 28358, -5345, 5656, - 12737, -20444, -11192, 4499, -13033, 9423, 15473, -6197, -11477, -24132, -22999, 3294, 12843, 23488, - -22592, 17694, 1116, -31017, 3559, -8669, 3749, 19644, -15, 1019, -30080, -28438, 28565, 6703, - 20814, -14883, 8207, 30841, 4757, -28851, 2912, -15670, 2200, -1691, -4344, -4849, -10692, 6799, - 31408, -1891, 5590, -16022, -14433, -18682, 15898, 9188, 28944, 11226, -24813, -11746, -10191, -32572, - 3915, 17252, 20390, 32119, -22752, 24422, -19415, 18263, 19205, 26845, 30540, -18021, -17102, 6958, - 19814, 8233, 10008, -22354, 1954, 12258, -31944, -22252, -2250, -1132, 2199, -16354, 6990, 26851, - 19930, -13363, -22413, -24830, 29727, -17177, -9270, 7914, -31330, 1845, 32474, 4194, 28495, 28261, - 25845, 32422, 10140, 9601, -7818, -19500, 14569, 3890, -29267, -12867, 30961, 18835, -14165, 2147, - 12751, -18407, -10020, 5990, 1006, 20350, -4938, 8521, -7952, -5919, -21426, 31663, -29606, -1320, - -7695, 23556, 5070, -13452, -31008, 2873, -3796, 32198, 26771, -11828, 30389, -12672, -21476, -8312, - -21547, 8834, -21552, 18945, 6644, -10235, -9847, -6666, 22020, 25547, 15942, -28049, -13648, -28248, - 5042, 12111, -11094, 16592, -23241, -30828, -26017, 4606, -29415, -31009, 22834, -20391, -222, 2276, - -384, 31430, 14211, 14539, -3111, 19179, 3043, -15641, 6284, 24979, 714, -28216, 7688, -26914, - -9063, 12562, 7734, 10361, -7962, 27838, -5784, -1574, 18089, -17538, -31685, 24385, 29618, -2640, - -11118, -25481, 28243, 15421, -22998, -31018, 23536, -15877, 22160, 8646, 1155, -15612, -7609, 1276, - 15550, 4121, -2007, 4769, 1355, 26105, -28117, 9226, 23985, 31896, 13528, 2701, -29226, 25925, - -9407, 6332, 25713, 9356, 28098, -3138, -10946, 29429, -5644, -24341, -17701, 26999, -26714, -24212, - 29545, 25315, -22259, -32631, 7710, -21436, -18867, 717, -23080, 24045, 9669, -26819, -1978, -9108, - 25945, -22957, 13003, 26071, 12974, -26402, -5129, -25643, -16833, -28776, 13084, 21294, -2536, 7399, - 14748, -29506, -18676, -18114, 18832, 31356, 20586, -28240, -21400, -27869, -3775, -1119, -7198, 7740, - 31333, 14122, 12796, 22339, -15000, -20731, -13325, 711, 27949, -1515, -11669, -16216, -18479, 22468, - 4002, -14883, 27357, 16479, 23040, 27424, 26035, 3444, 9231, 5671, -9290, 11833, 19976, -17984, - 14026, -22989, -5445, -5964, 31886, 25620, -9948, 12136, 31059, 3142, 18578, 24378, 30281, -32046, - 5905, 19297, 8925, 15880, -4287, -16874, -11792, -12982, 10110, -10745, -20725, -1788, -2242, 30547, - 28478, 7250, 4121, 10338, -12125, -19344, -28060, 9480, 17506, -22165, -32252, -5992, -20509, 14419, - -7687, -1528, -313, -19174, 12672, -3484, -29773, 20819, -26938, 14750, -28112, 4182, 1744, 20791, - -2639, 2156, 8160, 19440, -19354, -28020, 3153, 29569, -26267, 27299, -25939, 30870, 21492, 14231, - 21434, -20336, -24510, -27203, -10343, -19779, -28955, 1951, -5882, 26867, -15252, -14071, 10611, 30321, - -4884, -7729, 26540, 11375, 30748, 28310, -20981, 20520, -22696, -7009, -17040, -18823, -25489, 9954, - -31911, 3683, -21902, 12423, -25006, 10803, -1372, -13697, 3954, -28251, -10418, -14566, -10071, 18591, - 12458, 9853, -29836, 22512, -3954, 11605, -15656, -14313, -3179, 29012, 14007, 30276, 2779, -22254, - -83, -25743, 1589, -21622, 28752, 9344, 14839, 11267, -19627, -26066, -20358, 9531, -23003, -30540, - -26227, 31532, -8678, -31117, -8684, -7043, 9506, -11144, -649, 28027, -1039, 11277, -18224, -11257, - -30883, -19021, 30953, 11131, -1200, 14188, -16621, -3163, -14410, -19687, 32079, 27578, 16465, -20038, - -11683, -9078, 10172, 12439, 32160, 19542, -14576, -25222, 11637, -3947, -28374, 13965, 15354, -21667, - 16155, 26758, 19077, -26770, 1930, 28684, 6510, 6871, -21665, 28011, 29038, 25708, 3707, -79, - -6596, -13404, -23846, 11165, -13153, -5452, -8190, 11982, 2796, -4827, -1051, 1266, -5277, 23999, - -26585, -2040, -22051, -4521, -1225, -25079, -5392, -26223, 28397, -19003, 11335, 24701, 4128, 5124, - -7852, 14889, 23176, -16456, -25346, 6413, 19723, 16898, -27188, 10752, -13640, 15433, -4784, -851, - -2419, 4257, -24130, 8201, -21472, -7932, -3284, -6653, 23133, -21814, -26996, 11640, 4670, 20240, - 3487, 17443, -19753, 20124, -29698, -1995, 8296, 16224, 2411, 21281, -17327, 27542, -8876, 28528, - 28818, -25725, 30171, -73, 7770, 8317, -23519, 14536, 14064, -29733, 27349, -13451, 24370, 11526, - 29495, 12627, 20221, -14688, -9712, -19378, -21676, 13111, -23315, -17619, 15725, 22528, -23063, -30322, - -9571, 22625, -13659, 23939, -16643, 6745, -4567, 25487, 5463, -31564, -2307, 5494, -6885, -11866, - -13268, -18458, -29867, 5981, -26673, 20159, -21249, -10224, 12293, -2224, -5152, -1610, 16007, 9675, - 16808, 14164, -8436, 8217, 7118, -12353, 22597, 8961, 24999, -25113, 7077, -27593, -6244, -15840, - 9815, -22790, -18446, -1940, 26712, 30864, -26373, -12442, 13122, -29235, -8717, 7651, 17596, -27076, - 120, -25923, -14597, 3775, -6584, 5111, -866, -10848, 944, 7814, 10432, 5220, 20607, 10929, - 30193, -18457, -9231, 9566, 10884, 20292, -30923, -1800, 11130, -6630, 24757, 21420, -25268, -17769, - 25592, 14012, -21534, -15087, -22966, 1189, 22275, 26377, 1951, 13782, -10581, 386, 32539, 22078, - -5064, -26901, 5, 24796, 21951, 5068, 26016, 16554, -1974, -1682, 28399, 13570, 26271, -22459, - 15428, -32493, -5463, -5234, 17079, -15887, 7723, -2022, 17076, -20726, 20202, -8514, -24843, -11328, - 31722, 25532, 2811, 5854, -12900, 7984, 7058, -22557, -5340, 22836, -30404, 30089, 10617, -24053, - 4730, 23850, -6605, 14405, 27175, -16454, 3751, -22395, -18087, 17858, 16862, 27196, -3384, -19039, - 2147, -1374, -29561, 525, -760, -4906, -20839, -2911, 8651, -11184, -28438, 27234, -19122, 12593, - -5436, -24959, -3196, 11363, 22204, -21937, -27090, 16374, 21910, -10510, -25626, 32403, -8560, -22640, - -11840, 5218, -18298, 20950, -1833, -30105, 28834, 28001, 22472, 3481, 14234, -883, 19397, -19798, - -5865, 24219, 24195, -8624, 17268, 10928, -7761, -14364, -15707, 26290, 12605, -32402, 2111, -23578, - -2949, -16946, -10302, -23026, 6959, -5532, 19047, -23667, -4250, 9368, 7896, -7997, 2454, 21274, - -32318, 30907, 12343, 7511, -12188, -421, -9187, -15554, 19011, 5260, 5949, -15678, 6087, 24278, - 16521, -22054, -6910, -2442, 29518, 10937, -16383, 10913, -18011, -23719, -12091, 29295, 13399, -29940, - 25671, -27526, 24574, -11211, 3391, 5054, 1833, 29508, -11252, 19231, -27912, 15281, -26149, 25507, - 28310, -2993, 19369, 23057, -14734, 10906, -3426, -2604, -16109, -2349, 15816, -5305, 16767, -27813, - -25222, 25575, -22322, -27818, -17641, -17623, -26729, 19541, -25814, 10212, -12353, -11100, -10050, -401, - -25653, -12309, -24161, 9695, 16845, -11392, 27290, 3509, 4208, -18432, 26278, 31473, 8758, -9565, - -8183, 25047, -1913, -1681, -15776, -6487, -27186, -22317, 20725, -3899, 27187, 16388, 8480, 28753, - -10993, -24598, -30982, -11447, -26476, 1482, -24553, 26859, 28642, -19458, -32130, -24556, -15655, 20466, - -27512, -6953, -32361, -13016, -32389, -10911, -13193, 1832, 20121, 32168, -24234, 8105, -20236, -17010, - -6577, -30117, 27218, 19481, 30543, 6431, 13580, -3263, 14983, -17514, -14366, -25377, 4463, 14774, - -21620, 31451, -3644, 25809, 18972, 21936, 21708, -2646, -18398, 14711, 22351, -8245, 22256, -20862, - -12644, -3458, 6730, -24894, -5163, -18740, -15992, 17781, -28735, -10363, -25818, 7500, 19172, 25292, - -26079, 4017, -31633, -12733, 5899, 21885, -5458, -13289, 7180, 24664, 29851, 24403, 21714, -3064, - 17318, 5820, 10138, -32615, 21643, -6700, 11248, -15275, -9648, 15763, -24530, 10987, -30776, -27958, - -32190, -1910, 14296, -27339, -12022, -14880, 25656, 14641, -6212, -25080, -16612, 10172, -12993, 27074, - 15411, -19200, 28386, -5595, 562, -19176, -18832, -4331, -25984, -11184, -28867, -5568, 32270, 30329, - 31999, 5171, -2990, 5189, -26579, -977, -1150, 7552, -15369, -20998, 21953, 27155, 17522, 22281, - -25027, 15175, -24608, 12771, -26835, 28039, -4649, 15317, 571, 19424, 11319, 2126, 20090, 16755, - -13623, 17897, 1182, 7190, 5883, -6349, -23697, -6373, -27787, 29421, -20695, -3140, -21859, -10391, - -7434, 4248, 16096, -3110, -12434, -30937, 10180, 6634, 22943, 29236, -12460, -3780, 8290, -21755, - 27136, -926, -18078, 18726, 9046, 32642, 22527, -19423, 25814, -5835, -32293, -19919, -24368, 20332, - 29319, -24225, 28615, 19306, 15191, 18205, -16552, -16481, 7256, -27152, 21488, -907, 30876, -32160, - -14545, 32510, -23330, 6923, 21046, 15065, -24241, -3151, -11371, -9890, -2810, 1171, 977, 17103, - -6813, 19890, 26252, -5057, 29249, 17254, -12246, 29724, 1204, 29088, 12722, -12365, 15271, -2926, - -19677, -24499, 21572, -1067, 32022, 29789, 25952, -12915, 3697, -4666, 13911, -9100, -26332, -16514, - 1041, 29412, -21590, 25895, -28780, -11364, -25298, 23101, -15303, 18621, 31241, -2431, 1491, -21928, - 912, 18311, 9931, 2850, -27301, -8027, 8812, -15911, 15312, 22494, -7567, -12340, -20196, -2668, - 5259, -17146, 3641, 10899, -15677, 24581, 989, -16234, 5339, -21192, -17303, -23012, 7098, 8798, - -32343, 25384, 19720, -13718, 11385, 16665, -1084, -21487, -2948, 9763, 17951, 18882, -31590, -321, - -11217, -10770, 10327, 1795, -6777, -23762, 22042, -15570, -21771, 12171, 32364, 32545, -25008, 24175, - 10810, -32447, 12646, -10836, -732, 865, -27929, 14282, 7864, 4391, -28828, 9259, 9244, -14233, - -26517, 27703, 11574, -21138, 11837, -31330, -10245, 17888, 26826, 32487, 9048, -14627, 93, 5696, - 10093, 28810, -24909, -3595, -16169, 23736, -31152, 26574, -27613, -14802, -13402, -29931, 12711, 12573, - 28977, -20245, -11917, 21973, 31825, 16587, 9727, -28639, 30883, -19936, -15337, 22647, 11105, 14994, - -2274, 4341, 17143, 28387, -4835, 11580, 27322, 27191, -32136, 24076, 15896, 4391, 23204, 25804, - -31328, 11379, 7960, -4040, 29517, -9741, -12785, 18198, -30112, -26969, 25445, -27991, 19098, -9013, - 17900, -8433, 6720, -15059, 11091, 13079, -24965, 4866, -9897, 17792, 20154, -17720, 12987, 1887, - -32619, 22138, -26108, -6772, 32673, 11230, -12506, -20796, 2990, 9037, -4163, -19489, 19294, 31661, - 16832, -4469, -31789, -32676, 30485, 32629, 24362, 29434, 13129, 3772, -24723, 1708, 23825, 28813, - 14381, 19931, -9803, -25215, 466, -17591, -15432, -17191, -1726, -29992, 13519, -15727, -15054, -24538, - -11537, 1623, 2000, 15856, 5528, -15409, 17815, 20368, 32608, -12626, -118, 10265, -16578, 22306, - -21902, -31452, -17518, 10614, 14059, 1618, 32552, -3519, -21390, 31261, 14321, -9661, -18998, 1088, - 19266, -19441, 11549, -3727, 9154, 25524, 32691, -1745, -11347, 19007, 2411, 28834, 11763, 29072, - 29408, 27772, -6405, -17763, -10271, 19415, -7003, -11115, -3060, -23921, -680, 15493, 29612, -18536, - -13424, -2448, -6270, -19524, 11426, 30041, 32114, -1230, 27608, 23677, -10662, 1534, 7869, 1183, - -20537, -21016, 20304, -2125, 21004, -2966, -29543, -29147, 4328, -17097, -24867, -2046, -19748, -20467, - -19247, -6271, 10367, 30419, 16951, 27577, 18058, 27375, -1449, 9961, 28497, 24799, 6456, -25939, - 30081, -1633, 3691, -32694, -14586, -23474, 18571, -7209, -20509, -6334, -12074, -4362, 11038, 25662, - 158, 21124, 12715, 14751, 30082, -1767, 14007, -2462, -6700, -27723, -11026, 10420, -18646, 29837, - 21870, 28143, -23229, 24962, 6430, 974, 23745, -9682, -29169, 26478, -9102, -22438, 2922, -13599, - -13919, 19694, -32692, -32274, 25488, 23838, -27993, 19496, -12506, -14420, -5136, 30586, 11489, 5811, - -309, 8886, 20595, -13465, -1367, -15277, 28850, -26556, 1382, -6583, 18620, -31942, 22490, 26973, - -20212, 11053, -22042, 32517, 2269, 2520, -24119, 4478, -8496, 31527, -29682, 11648, -23544, 15705, - 7422, -14021, 24938, 22229, -14866, 27997, -32583, -942, 33, -20123, -11898, 10677, 7913, -18304, - -31230, -28901, -5940, -73, 25069, -8638, -26803, -29153, 7535, 19585, -7830, 30194, 23300, -5885, - -6154, -15737, 10959, -32217, 11445, 30723, 28885, 24364, 11394, 11697, -17640, -3903, 22753, -2487, - 17484, -22161, -14162, 15165, -29190, -20530, -6340, -12867, -6847, 9477, 28706, 21953, -2412, 16256, - -4351, -11688, 23469, -13915, 12476, -28253, -17824, 2344, -28550, -18579, 26995, 17963, 9352, 29858, - -12041, -23171, 31568, -32367, -16520, -6412, -1132, 5068, 544, -30553, 32231, -27297, -2875, -29057, - -25338, -3229, -17764, -6228, 21520, 12038, 10437, -28977, 16608, 24530, -31838, 2300, 15184, -11134, - -5251, -31828, -4517, 7377, -28618, -1126, 29035, -2089, -17564, 10122, 21512, -695, -9825, 16352, - -24477, 14377, -15468, 24411, 31380, 2958, -30994, -27026, 27507, 24126, 14026, 30247, -19017, 25727, - 4041, 30009, 12612, 30124, 12934, 22198, -23154, 1128, -5483, -12015, -17047, 18310, -25061, 979, - 27336, -30457, -14375, 14468, -30583, 6059, -11779, 21451, 6652, -8430, 6564, -19374, -24767, -17465, - -26341, 32239, -9470, 24584, 14722, -22512, -26375, 31698, -15959, -31087, -8827, 7563, -10943, -15679, - -21797, 23360, 23472, 4123, -7940, 16879, -21632, 954, 22639, -29092, 9894, 20825, -24144, -30242, - -20200, 5116, -1636, 19928, 18965, -13831, -21216, 10330, -17188, -30857, 29705, 8701, 8304, 15283, - 11191, 24489, -5646, -18181, 7123, -32728, 18695, 17573, 12538, -6063, 29361, 18125, 17206, 16301, - -3906, -13454, -3731, -1680, -16239, 2898, -8952, -11999, -31862, 3820, -10151, -15792, -31057, 11376, - 25935, 14124, -32158, -18654, 30559, -32002, 3649, 2657, -30641, -8324, 16774, -1374, 16768, 24561, - 17931, -15981, -4562, -13693, 10469, -5003, 16752, -18287, -32279, 24465, -22887, 7172, -28841, 1686, - -7970, -15495, -11602, 24764, -5549, 13686, -11440, -29481, 16484, 7628, 675, -9821, 15555, 20104, - -12762, 12873, -28098, 10669, -12930, 19970, -31282, -23978, 2875, -27096, -1382, -29211, 1561, 26413, - 24519, 30996, -4968, -3852, 3277, 27850, 25535, 30910, 11756, -13961, -30635, 28290, -29362, 16152, - 3412, 9626, -15394, 16830, 11221, -11736, -30262, -14514, 23933, 6498, -6522, 3268, 6427, -6963, - 32431, -21675, -24076, 22501, -21791, -24449, -30680, -9193, -5414, 13759, 10893, 7941, -3811, -3407, - -13349, -18564, -31742, -14826, -31417, -25049, 8756, -2704, -7009, -23369, 26219, 27108, 8079, -8913, - 16338, 21626, -23695, 31695, -29632, 17900, 3527, 28953, -13898, 6624, -4239, -22956, -23978, -24584, - 18456, -24838, -3397, -10174, 21701, -30669, -17165, -9840, -30793, -15863, 797, 1802, -27107, 7637, - -21166, -12057, 20182, -9754, -7283, -16819, 5265, 29933, -27507, 16044, -22070, -13090, 11335, 6632, - -15950, -29709, 936, 958, 1083, 28004, 13930, -6483, 10178, -15104, 3180, -28938, -30723, 2419, - 18522, 3257, -26761, -15895, -17816, -21316, -32060, -10594, 7726, -31331, 3758, 11042, 26013, 3893, - -27378, -552, -10810, 21046, 27414, -16990, 10096, -10203, -25913, 1489, 20204, -8052, -32157, -4039, - 12006, -14845, 29538, -23046, 28821, 6111, -14879, 31258, -15977, -2340, 31473, -23648, 11489, 30069, - 25033, -983, 15229, -5929, -18366, 27890, -7056, 21675, -25862, 31516, -2988, -29909, -21833, -11010, - 10157, -31021, 19411, 5736, 16198, -29290, 11040, 30435, 32333, 5886, 28165, 5161, -6729, -6274, - -3030, -5996, 16745, 12453, -22320, 6517, -22442, -11386, -27798, -21402, -7708, -28034, 5190, 1499, - 28653, -16219, 26507, -29712, -30796, 12590, -226, 7860, -15959, 14143, 23161, -6048, -12206, 19133, - 2170, -22026, -14422, 22069, 22807, -24197, 4014, 5583, -32145, -22189, 4682, 1911, -16654, 7155, - 30213, -6298, 9948, -20266}; - -const static Activation layer_activation_relu(ReLU); -const static Activation layer_activation_relu_s16(ReLU); - -const static __attribute__((aligned(16))) int8_t layer_activation_lrelu_element[] = {45}; -const static Activation layer_activation_lrelu(LeakyReLU, - layer_activation_lrelu_element, - -8, - { - 1, - }); -const static __attribute__((aligned(16))) int16_t layer_activation_lrelu_element_s16[] = {45}; -const static Activation layer_activation_lrelu_s16(LeakyReLU, - layer_activation_lrelu_element_s16, - -16, - { - 1, - }); - -const static __attribute__((aligned(16))) int8_t layer_activation_prelu_element[] = { - 105, -45, 57, -43, -83, -118, -112, 31, 57, 108, -81, 112, 51, 39, -101, 9, -51, -79, -38, -90, 19, 58, - -88, -53, 0, -43, -90, 66, 55, -26, -124, 49, 93, 101, 57, -9, -71, -35, 26, -127, -73, 70, -111, 30, - 95, -58, 59, 7, 66, 59, 27, -63, -9, 23, -63, -84, -47, 7, 100, 31, 83, 46, 64, 39}; -const static Activation layer_activation_prelu(PReLU, - layer_activation_prelu_element, - -8, - { - 64, - }); -const static __attribute__((aligned(16))) int16_t layer_activation_prelu_element_s16[] = { - -23933, -23754, -14125, 29551, 9521, -3564, 22762, -24494, -16540, 7168, 11506, 31326, -16597, - -3584, -14775, -21436, 682, 23594, -3895, 22899, 28855, -21727, 28286, -1898, -7434, 160, - -24794, -28018, -7888, -7696, 11154, -22546, 5224, 8034, -30821, 10611, -27672, 13500, -16067, - 7163, -20827, -15563, 6647, 8437, -6567, 30548, -24065, 17366, -2990, 5189, -26579, -977, - -1150, 7552, -15369, -20998, 21953, 27155, 17522, 22281, -25027, 15175, -24608, 12771}; -const static Activation layer_activation_prelu_s16(PReLU, - layer_activation_prelu_element_s16, - -16, - { - 64, - }); diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_layer.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_layer.cpp deleted file mode 100644 index 1916424c..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_layer.cpp +++ /dev/null @@ -1,699 +0,0 @@ -#include "test_data.hpp" - -#include "dl_base.hpp" -#include "dl_define.hpp" -#include "dl_tool.hpp" - -#include "dl_layer_add2d.hpp" -#include "dl_layer_avg_pool2d.hpp" -#include "dl_layer_concat.hpp" -#include "dl_layer_conv2d.hpp" -#include "dl_layer_depthwise_conv2d.hpp" -#include "dl_layer_expand_dims.hpp" -#include "dl_layer_flatten.hpp" -#include "dl_layer_fullyconnected.hpp" -#include "dl_layer_global_avg_pool2d.hpp" -#include "dl_layer_global_max_pool2d.hpp" -#include "dl_layer_leakyrelu.hpp" -#include "dl_layer_max2d.hpp" -#include "dl_layer_max_pool2d.hpp" -#include "dl_layer_min2d.hpp" -#include "dl_layer_mul2d.hpp" -#include "dl_layer_pad.hpp" -#include "dl_layer_prelu.hpp" -#include "dl_layer_relu.hpp" -#include "dl_layer_reshape.hpp" -#include "dl_layer_squeeze.hpp" -#include "dl_layer_sub2d.hpp" -#include "dl_layer_transpose.hpp" - -#include "unity.h" -#include - -using namespace dl; -using namespace nn; -using namespace layer; -using namespace tool; -using namespace base; -using namespace std; - -int exponent0 = 0; -int exponent1 = 0; -int output_exponent = 0; - -int height = 3; -int width = 3; -int channel = 16; - -bool test_add_layer() -{ - printf("\n add\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s8.set_value(1); - Tensor input1_s8; - input1_s8.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s8.set_value(2); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s16.set_value(1); - Tensor input1_s16; - input1_s16.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s16.set_value(2); - - Add2D _add2d_s8_1(output_exponent, NULL, "add2d_s8 inplace", true); - Add2D _add2d_s8_2(output_exponent, NULL, "add2d_s8 no inplace", false); - Add2D _add2d_s16_1(output_exponent, NULL, "add2d_s16 inplace", true); - Add2D _add2d_s16_2(output_exponent, NULL, "add2d_s16 no inplace", false); - - _add2d_s8_1.build(input0_s8, input1_s8, true); - _add2d_s8_2.build(input0_s8, input1_s8, true); - _add2d_s16_1.build(input0_s16, input1_s16, true); - _add2d_s16_2.build(input0_s16, input1_s16, true); - - _add2d_s8_1.call(input0_s8, input1_s8); - _add2d_s8_1.get_output().print({}, "add2d_s8 inplace"); - _add2d_s8_2.call(input0_s8, input1_s8); - _add2d_s8_2.get_output().print({}, "add2d_s8 no inplace"); - _add2d_s16_1.call(input0_s16, input1_s16); - _add2d_s16_1.get_output().print({}, "add2d_s16 inplace"); - _add2d_s16_2.call(input0_s16, input1_s16); - _add2d_s16_2.get_output().print({}, "add2d_s16 no inplace"); - - return true; -} - -bool test_sub_layer() -{ - printf("\n sub\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s8.set_value(1); - Tensor input1_s8; - input1_s8.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s8.set_value(2); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s16.set_value(1); - Tensor input1_s16; - input1_s16.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s16.set_value(2); - - Sub2D _sub2d_s8_1(output_exponent, NULL, "sub2d_s8 inplace", true); - Sub2D _sub2d_s8_2(output_exponent, NULL, "sub2d_s8 no inplace", false); - Sub2D _sub2d_s16_1(output_exponent, NULL, "sub2d_s16 inplace", true); - Sub2D _sub2d_s16_2(output_exponent, NULL, "sub2d_s16 no inplace", false); - - _sub2d_s8_1.build(input0_s8, input1_s8, true); - _sub2d_s8_2.build(input0_s8, input1_s8, true); - _sub2d_s16_1.build(input0_s16, input1_s16, true); - _sub2d_s16_2.build(input0_s16, input1_s16, true); - - _sub2d_s8_1.call(input0_s8, input1_s8); - _sub2d_s8_1.get_output().print({}, "sub2d_s8 inplace"); - _sub2d_s8_2.call(input0_s8, input1_s8); - _sub2d_s8_2.get_output().print({}, "sub2d_s8 no inplace"); - _sub2d_s16_1.call(input0_s16, input1_s16); - _sub2d_s16_1.get_output().print({}, "sub2d_s16 inplace"); - _sub2d_s16_2.call(input0_s16, input1_s16); - _sub2d_s16_2.get_output().print({}, "sub2d_s16 no inplace"); - - return true; -} - -bool test_mul_layer() -{ - printf("\n mul\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s8.set_value(1); - Tensor input1_s8; - input1_s8.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s8.set_value(2); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s16.set_value(1); - Tensor input1_s16; - input1_s16.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s16.set_value(2); - - Mul2D _mul2d_s8_1(output_exponent, NULL, "mul2d_s8 inplace", true); - Mul2D _mul2d_s8_2(output_exponent, NULL, "mul2d_s8 no inplace", false); - Mul2D _mul2d_s16_1(output_exponent, NULL, "mul2d_s16 inplace", true); - Mul2D _mul2d_s16_2(output_exponent, NULL, "mul2d_s16 no inplace", false); - - _mul2d_s8_1.build(input0_s8, input1_s8, true); - _mul2d_s8_2.build(input0_s8, input1_s8, true); - _mul2d_s16_1.build(input0_s16, input1_s16, true); - _mul2d_s16_2.build(input0_s16, input1_s16, true); - - _mul2d_s8_1.call(input0_s8, input1_s8); - _mul2d_s8_1.get_output().print({}, "mul2d_s8 inplace"); - _mul2d_s8_2.call(input0_s8, input1_s8); - _mul2d_s8_2.get_output().print({}, "mul2d_s8 no inplace"); - _mul2d_s16_1.call(input0_s16, input1_s16); - _mul2d_s16_1.get_output().print({}, "mul2d_s16 inplace"); - _mul2d_s16_2.call(input0_s16, input1_s16); - _mul2d_s16_2.get_output().print({}, "mul2d_s16 no inplace"); - - return true; -} - -bool test_expand_dims_layer() -{ - printf("\n expand_dims\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s8.set_value(1); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s16.set_value(2); - Tensor input1_s8(input0_s8, true); - Tensor input1_s16(input0_s16, true); - - ExpandDims _expand_dims_s8_1({0, 1}, "expand_dims_s8 inplace", true); - ExpandDims _expand_dims_s8_2({0, 1}, "expand_dims_s8 no inplace", false); - ExpandDims _expand_dims_s16_1({0, 1}, "expand_dims_s16 inplace", true); - ExpandDims _expand_dims_s16_2({0, 1}, "expand_dims_s16 no inplace", false); - - _expand_dims_s8_1.build(input0_s8, true); - _expand_dims_s8_2.build(input1_s8, true); - _expand_dims_s16_1.build(input0_s16, true); - _expand_dims_s16_2.build(input1_s16, true); - - _expand_dims_s8_2.call(input1_s8); - _expand_dims_s8_2.get_output().print({}, "expand_dims_s8 no inplace"); - _expand_dims_s8_1.call(input0_s8); - _expand_dims_s8_1.get_output().print({}, "expand_dims_s8 inplace"); - _expand_dims_s16_2.call(input1_s16); - _expand_dims_s16_2.get_output().print({}, "expand_dims2d_s16 no inplace"); - _expand_dims_s16_1.call(input0_s16); - _expand_dims_s16_1.get_output().print({}, "expand_dims_s16 inplace"); - - return true; -} - -bool test_flatten_layer() -{ - printf("\n flatten\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s8.set_value(1); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s16.set_value(2); - Tensor input1_s8(input0_s8, true); - Tensor input1_s16(input0_s16, true); - - Flatten _flatten_s8_1("flatten_s8 inplace", true); - Flatten _flatten_s8_2("flatten_s8 no inplace", false); - Flatten _flatten_s16_1("flatten_s16 inplace", true); - Flatten _flatten_s16_2("flatten_s16 no inplace", false); - - _flatten_s8_1.build(input0_s8, true); - _flatten_s8_2.build(input1_s8, true); - _flatten_s16_1.build(input0_s16, true); - _flatten_s16_2.build(input1_s16, true); - - _flatten_s8_2.call(input1_s8); - _flatten_s8_2.get_output().print({}, "flatten_s8 no inplace"); - _flatten_s8_1.call(input0_s8); - _flatten_s8_1.get_output().print({}, "flatten_s8 inplace"); - _flatten_s16_2.call(input1_s16); - _flatten_s16_2.get_output().print({}, "flatten2d_s16 no inplace"); - _flatten_s16_1.call(input0_s16); - _flatten_s16_1.get_output().print({}, "flatten_s16 inplace"); - - return true; -} - -bool test_squeeze_layer() -{ - printf("\n squeeze\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({1, height, width, channel, 1}).set_auto_free(true).malloc_element(); - input0_s8.set_value(1); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({1, height, width, channel, 1}).set_auto_free(true).malloc_element(); - input0_s16.set_value(2); - Tensor input1_s8(input0_s8, true); - Tensor input1_s16(input0_s16, true); - - Squeeze _squeeze_s8_1(INT32_MAX, "squeeze_s8 inplace", true); - Squeeze _squeeze_s8_2(-1, "squeeze_s8 no inplace", false); - Squeeze _squeeze_s16_1(INT32_MAX, "squeeze_s16 inplace", true); - Squeeze _squeeze_s16_2(-1, "squeeze_s16 no inplace", false); - - _squeeze_s8_1.build(input0_s8, true); - _squeeze_s8_2.build(input1_s8, true); - _squeeze_s16_1.build(input0_s16, true); - _squeeze_s16_2.build(input1_s16, true); - - _squeeze_s8_2.call(input1_s8); - _squeeze_s8_2.get_output().print({}, "squeeze_s8 no inplace"); - _squeeze_s8_1.call(input0_s8); - _squeeze_s8_1.get_output().print({}, "squeeze_s8 inplace"); - _squeeze_s16_2.call(input1_s16); - _squeeze_s16_2.get_output().print({}, "squeeze2d_s16 no inplace"); - _squeeze_s16_1.call(input0_s16); - _squeeze_s16_1.get_output().print({}, "squeeze_s16 inplace"); - - return true; -} - -bool test_reshape_layer() -{ - printf("\n reshape\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s8.set_value(1); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s16.set_value(2); - Tensor input1_s8(input0_s8, true); - Tensor input1_s16(input0_s16, true); - - Reshape _reshape_s8_1({-1, width * channel}, "reshape_s8 inplace", true); - Reshape _reshape_s8_2({-1, width * channel}, "reshape_s8 no inplace", false); - Reshape _reshape_s16_1({-1, width * channel}, "reshape_s16 inplace", true); - Reshape _reshape_s16_2({-1, width * channel}, "reshape_s16 no inplace", false); - - _reshape_s8_1.build(input0_s8, true); - _reshape_s8_2.build(input1_s8, true); - _reshape_s16_1.build(input0_s16, true); - _reshape_s16_2.build(input1_s16, true); - - _reshape_s8_2.call(input1_s8); - _reshape_s8_2.get_output().print({}, "reshape_s8 no inplace"); - _reshape_s8_1.call(input0_s8); - _reshape_s8_1.get_output().print({}, "reshape_s8 inplace"); - _reshape_s16_2.call(input1_s16); - _reshape_s16_2.get_output().print({}, "reshape2d_s16 no inplace"); - _reshape_s16_1.call(input0_s16); - _reshape_s16_1.get_output().print({}, "reshape_s16 inplace"); - - return true; -} - -bool test_transpose_layer() -{ - printf("\n transpose\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s8.set_value(1); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s16.set_value(2); - Tensor input1_s8(input0_s8, true); - Tensor input1_s16(input0_s16, true); - - Transpose _transpose_s8_1({-1, 0, 1}, "transpose_s8 inplace", true); - Transpose _transpose_s8_2({-1, 0, 1}, "transpose_s8 no inplace", false); - Transpose _transpose_s16_1({-1, 0, 1}, "transpose_s16 inplace", true); - Transpose _transpose_s16_2({-1, 0, 1}, "transpose_s16 no inplace", false); - - _transpose_s8_1.build(input0_s8, true); - _transpose_s8_2.build(input1_s8, true); - _transpose_s16_1.build(input0_s16, true); - _transpose_s16_2.build(input1_s16, true); - - input0_s8.set_shape({height, width, channel}); - input0_s16.set_shape({height, width, channel}); - - _transpose_s8_2.call(input1_s8); - _transpose_s8_2.get_output().print({}, "transpose_s8 no inplace"); - _transpose_s8_1.call(input0_s8); - _transpose_s8_1.get_output().print({}, "transpose_s8 inplace"); - _transpose_s16_2.call(input1_s16); - _transpose_s16_2.get_output().print({}, "transpose2d_s16 no inplace"); - _transpose_s16_1.call(input0_s16); - _transpose_s16_1.get_output().print({}, "transpose_s16 inplace"); - - return true; -} - -bool test_pad_layer() -{ - printf("\n pad\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s8.set_value(1); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s16.set_value(2); - Tensor input1_s8(input0_s8, true); - Tensor input1_s16(input0_s16, true); - - Pad _pad_s8_1({1, 1, 1, 1, 1, 1}, {100, 101, 102, 103, 104, 105}, PADDING_CONSTANT, "pad_s8 constant"); - Pad _pad_s8_2({1, 1, 1, 1, 1, 1}, {100, 101, 102, 103, 104, 105}, PADDING_REFLECT, "pad_s8 reflect"); - Pad _pad_s16_1({1, 1, 1, 1, 1, 1}, {100, 101, 102, 103, 104, 105}, PADDING_CONSTANT, "pad_s16 constant"); - Pad _pad_s16_2({1, 1, 1, 1, 1, 1}, {100, 101, 102, 103, 104, 105}, PADDING_REFLECT, "pad_s16 reflect"); - - _pad_s8_1.build(input0_s8, true); - _pad_s8_2.build(input1_s8, true); - _pad_s16_1.build(input0_s16, true); - _pad_s16_2.build(input1_s16, true); - - _pad_s8_1.call(input0_s8); - _pad_s8_1.get_output().print({}, "pad_s8 constant"); - _pad_s8_2.call(input1_s8); - _pad_s8_2.get_output().print({}, "pad_s8 reflect"); - _pad_s16_1.call(input0_s16); - _pad_s16_1.get_output().print({}, "pad2d_s16 constant"); - _pad_s16_2.call(input1_s16); - _pad_s16_2.get_output().print({}, "pad_s16 reflect"); - - return true; -} - -bool test_min_layer() -{ - printf("\n min\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s8.set_value(1); - Tensor input1_s8; - input1_s8.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s8.set_value(2); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s16.set_value(1); - Tensor input1_s16; - input1_s16.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s16.set_value(2); - - Min2D _min2d_s8_1("min2d_s8 inplace", true); - Min2D _min2d_s8_2("min2d_s8 no inplace", false); - Min2D _min2d_s16_1("min2d_s16 inplace", true); - Min2D _min2d_s16_2("min2d_s16 no inplace", false); - - _min2d_s8_1.build(input0_s8, input1_s8, true); - _min2d_s8_2.build(input0_s8, input1_s8, true); - _min2d_s16_1.build(input0_s16, input1_s16, true); - _min2d_s16_2.build(input0_s16, input1_s16, true); - - _min2d_s8_1.call(input0_s8, input1_s8); - _min2d_s8_1.get_output().print({}, "min2d_s8 inplace"); - _min2d_s8_2.call(input0_s8, input1_s8); - _min2d_s8_2.get_output().print({}, "min2d_s8 no inplace"); - _min2d_s16_1.call(input0_s16, input1_s16); - _min2d_s16_1.get_output().print({}, "min2d_s16 inplace"); - _min2d_s16_2.call(input0_s16, input1_s16); - _min2d_s16_2.get_output().print({}, "min2d_s16 no inplace"); - - return true; -} - -bool test_max_layer() -{ - printf("\n max\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s8.set_value(1); - Tensor input1_s8; - input1_s8.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s8.set_value(2); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s16.set_value(1); - Tensor input1_s16; - input1_s16.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s16.set_value(2); - - Max2D _max2d_s8_1("max2d_s8 inplace", true); - Max2D _max2d_s8_2("max2d_s8 no inplace", false); - Max2D _max2d_s16_1("max2d_s16 inplace", true); - Max2D _max2d_s16_2("max2d_s16 no inplace", false); - - _max2d_s8_1.build(input0_s8, input1_s8, true); - _max2d_s8_2.build(input0_s8, input1_s8, true); - _max2d_s16_1.build(input0_s16, input1_s16, true); - _max2d_s16_2.build(input0_s16, input1_s16, true); - - _max2d_s8_1.call(input0_s8, input1_s8); - _max2d_s8_1.get_output().print({}, "max2d_s8 inplace"); - _max2d_s8_2.call(input0_s8, input1_s8); - _max2d_s8_2.get_output().print({}, "max2d_s8 no inplace"); - _max2d_s16_1.call(input0_s16, input1_s16); - _max2d_s16_1.get_output().print({}, "max2d_s16 inplace"); - _max2d_s16_2.call(input0_s16, input1_s16); - _max2d_s16_2.get_output().print({}, "max2d_s16 no inplace"); - - return true; -} - -bool test_global_max_pool_layer() -{ - printf("\n global_max_pool\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s8.set_value({0, 1, 0, 3, 0, 16}, 1); - input0_s8.set_value({1, 2, 0, 3, 0, 16}, 2); - input0_s8.set_value({2, 3, 0, 3, 0, 16}, 3); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s16.set_value({0, 1, 0, 3, 0, 16}, 1); - input0_s16.set_value({1, 2, 0, 3, 0, 16}, 2); - input0_s16.set_value({2, 3, 0, 3, 0, 16}, 3); - - GlobalMaxPool2D _global_max_pool_s8_1("global_max_pool_s8"); - GlobalMaxPool2D _global_max_pool_s16_1("global_max_pool_s16"); - - _global_max_pool_s8_1.build(input0_s8, true); - _global_max_pool_s16_1.build(input0_s16, true); - - _global_max_pool_s8_1.call(input0_s8); - _global_max_pool_s8_1.get_output().print({}, "global_max_pool_s8 constant"); - _global_max_pool_s16_1.call(input0_s16); - _global_max_pool_s16_1.get_output().print({}, "global_max_pool2d_s16 constant"); - - return true; -} - -bool test_global_avg_pool_layer() -{ - printf("\n global_avg_pool\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s8.set_value({0, 1, 0, 3, 0, 16}, 9); - input0_s8.set_value({1, 2, 0, 3, 0, 16}, 18); - input0_s8.set_value({2, 3, 0, 3, 0, 16}, 27); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s16.set_value({0, 1, 0, 3, 0, 16}, 9); - input0_s16.set_value({1, 2, 0, 3, 0, 16}, 18); - input0_s16.set_value({2, 3, 0, 3, 0, 16}, 27); - - GlobalAveragePool2D _global_avg_pool_s8_1(output_exponent, "global_avg_pool_s8"); - GlobalAveragePool2D _global_avg_pool_s16_1(output_exponent, "global_avg_pool_s16"); - - _global_avg_pool_s8_1.build(input0_s8, true); - _global_avg_pool_s16_1.build(input0_s16, true); - - _global_avg_pool_s8_1.call(input0_s8); - _global_avg_pool_s8_1.get_output().print({}, "global_avg_pool_s8 constant"); - _global_avg_pool_s16_1.call(input0_s16); - _global_avg_pool_s16_1.get_output().print({}, "global_avg_pool2d_s16 constant"); - - return true; -} - -bool test_relu_layer() -{ - printf("\n relu\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s8.set_value({0, 1, 0, 3, 0, 16}, -9); - input0_s8.set_value({1, 2, 0, 3, 0, 16}, 18); - input0_s8.set_value({2, 3, 0, 3, 0, 16}, -27); - Tensor input1_s8; - input1_s8.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s8.set_value({0, 1, 0, 3, 0, 16}, 9); - input1_s8.set_value({1, 2, 0, 3, 0, 16}, -18); - input1_s8.set_value({2, 3, 0, 3, 0, 16}, 27); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s16.set_value({0, 1, 0, 3, 0, 16}, -9); - input0_s16.set_value({1, 2, 0, 3, 0, 16}, 18); - input0_s16.set_value({2, 3, 0, 3, 0, 16}, -27); - Tensor input1_s16; - input1_s16.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s16.set_value({0, 1, 0, 3, 0, 16}, 9); - input1_s16.set_value({1, 2, 0, 3, 0, 16}, -18); - input1_s16.set_value({2, 3, 0, 3, 0, 16}, 27); - - Relu _relu_s8_1("relu_s8 inplace", true); - Relu _relu_s8_2("relu_s8 no inplace", false); - Relu _relu_s16_1("relu_s16 inplace", true); - Relu _relu_s16_2("relu_s16 no inplace", false); - - _relu_s8_1.build(input0_s8, true); - _relu_s8_2.build(input0_s8, true); - _relu_s16_1.build(input0_s16, true); - _relu_s16_2.build(input0_s16, true); - - _relu_s8_1.call(input0_s8); - _relu_s8_1.get_output().print({}, "relu_s8 inplace"); - _relu_s8_2.call(input1_s8); - _relu_s8_2.get_output().print({}, "relu_s8 no inplace"); - _relu_s16_1.call(input0_s16); - _relu_s16_1.get_output().print({}, "relu_s16 inplace"); - _relu_s16_2.call(input1_s16); - _relu_s16_2.get_output().print({}, "relu_s16 no inplace"); - - return true; -} - -bool test_leaky_relu_layer() -{ - printf("\n leaky_relu\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s8.set_value({0, 1, 0, 3, 0, 16}, -9); - input0_s8.set_value({1, 2, 0, 3, 0, 16}, 18); - input0_s8.set_value({2, 3, 0, 3, 0, 16}, -27); - Tensor input1_s8; - input1_s8.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s8.set_value({0, 1, 0, 3, 0, 16}, 9); - input1_s8.set_value({1, 2, 0, 3, 0, 16}, -18); - input1_s8.set_value({2, 3, 0, 3, 0, 16}, 27); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s16.set_value({0, 1, 0, 3, 0, 16}, -9); - input0_s16.set_value({1, 2, 0, 3, 0, 16}, 18); - input0_s16.set_value({2, 3, 0, 3, 0, 16}, -27); - Tensor input1_s16; - input1_s16.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s16.set_value({0, 1, 0, 3, 0, 16}, 9); - input1_s16.set_value({1, 2, 0, 3, 0, 16}, -18); - input1_s16.set_value({2, 3, 0, 3, 0, 16}, 27); - - LeakyRelu _leaky_relu_s8_1(2, exponent0, "leaky_relu_s8 inplace", true); - LeakyRelu _leaky_relu_s8_2(2, exponent0, "leaky_relu_s8 no inplace", false); - LeakyRelu _leaky_relu_s16_1(2, exponent0, "leaky_relu_s16 inplace", true); - LeakyRelu _leaky_relu_s16_2(2, exponent0, "leaky_relu_s16 no inplace", false); - - _leaky_relu_s8_1.build(input0_s8, true); - _leaky_relu_s8_2.build(input0_s8, true); - _leaky_relu_s16_1.build(input0_s16, true); - _leaky_relu_s16_2.build(input0_s16, true); - - _leaky_relu_s8_1.call(input0_s8); - _leaky_relu_s8_1.get_output().print({}, "leaky_relu_s8 inplace"); - _leaky_relu_s8_2.call(input1_s8); - _leaky_relu_s8_2.get_output().print({}, "leaky_relu_s8 no inplace"); - _leaky_relu_s16_1.call(input0_s16); - _leaky_relu_s16_1.get_output().print({}, "leaky_relu_s16 inplace"); - _leaky_relu_s16_2.call(input1_s16); - _leaky_relu_s16_2.get_output().print({}, "leaky_relu_s16 no inplace"); - - return true; -} - -bool test_prelu_layer() -{ - printf("\n prelu\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s8.set_value({0, 1, 0, 3, 0, 16}, -9); - input0_s8.set_value({1, 2, 0, 3, 0, 16}, 18); - input0_s8.set_value({2, 3, 0, 3, 0, 16}, -27); - Tensor input1_s8; - input1_s8.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s8.set_value({0, 1, 0, 3, 0, 16}, 9); - input1_s8.set_value({1, 2, 0, 3, 0, 16}, -18); - input1_s8.set_value({2, 3, 0, 3, 0, 16}, 27); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s16.set_value({0, 1, 0, 3, 0, 16}, -9); - input0_s16.set_value({1, 2, 0, 3, 0, 16}, 18); - input0_s16.set_value({2, 3, 0, 3, 0, 16}, -27); - Tensor input1_s16; - input1_s16.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s16.set_value({0, 1, 0, 3, 0, 16}, 9); - input1_s16.set_value({1, 2, 0, 3, 0, 16}, -18); - input1_s16.set_value({2, 3, 0, 3, 0, 16}, 27); - int8_t alpha_s8[16] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; - int16_t alpha_s16[16] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; - - PRelu _prelu_s8_1(alpha_s8, exponent0, "prelu_s8 inplace", true); - PRelu _prelu_s8_2(alpha_s8, exponent0, "prelu_s8 no inplace", false); - PRelu _prelu_s16_1(alpha_s16, exponent0, "prelu_s16 inplace", true); - PRelu _prelu_s16_2(alpha_s16, exponent0, "prelu_s16 no inplace", false); - - _prelu_s8_1.build(input0_s8, true); - _prelu_s8_2.build(input0_s8, true); - _prelu_s16_1.build(input0_s16, true); - _prelu_s16_2.build(input0_s16, true); - - _prelu_s8_1.call(input0_s8); - _prelu_s8_1.get_output().print({}, "prelu_s8 inplace"); - _prelu_s8_2.call(input1_s8); - _prelu_s8_2.get_output().print({}, "prelu_s8 no inplace"); - _prelu_s16_1.call(input0_s16); - _prelu_s16_1.get_output().print({}, "prelu_s16 inplace"); - _prelu_s16_2.call(input1_s16); - _prelu_s16_2.get_output().print({}, "prelu_s16 no inplace"); - - return true; -} - -bool test_concat_layer() -{ - printf("\n concat\n"); - Tensor input0_s8; - input0_s8.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s8.set_value(1); - Tensor input1_s8; - input1_s8.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s8.set_value(2); - Tensor input0_s16; - input0_s16.set_exponent(exponent0).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input0_s16.set_value(1); - Tensor input1_s16; - input1_s16.set_exponent(exponent1).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - input1_s16.set_value(2); - - Concat _concat_s8_1(-1, "concat_s8 inplace"); - Concat _concat_s8_2(-2, "concat_s8 no inplace"); - Concat _concat_s16_1(0, "concat_s16 inplace"); - Concat _concat_s16_2(1, "concat_s16 no inplace"); - - _concat_s8_1.build({&input0_s8, &input1_s8}, true); - _concat_s8_2.build({&input0_s8, &input1_s8}, true); - _concat_s16_1.build({&input0_s16, &input1_s16}, true); - _concat_s16_2.build({&input0_s16, &input1_s16}, true); - - _concat_s8_1.call({&input0_s8, &input1_s8}); - _concat_s8_1.get_output().print({}, "concat_s8 inplace"); - _concat_s8_2.call({&input0_s8, &input1_s8}); - _concat_s8_2.get_output().print({}, "concat_s8 no inplace"); - _concat_s16_1.call({&input0_s16, &input1_s16}); - _concat_s16_1.get_output().print({}, "concat_s16 inplace"); - _concat_s16_2.call({&input0_s16, &input1_s16}); - _concat_s16_2.get_output().print({}, "concat_s16 no inplace"); - - return true; -} - -TEST_CASE("test layer", "[add]") -{ - TEST_ASSERT(test_add_layer()); - TEST_ASSERT(test_sub_layer()); - TEST_ASSERT(test_mul_layer()); - TEST_ASSERT(test_expand_dims_layer()); - TEST_ASSERT(test_flatten_layer()); - TEST_ASSERT(test_squeeze_layer()); - TEST_ASSERT(test_reshape_layer()); - TEST_ASSERT(test_transpose_layer()); - TEST_ASSERT(test_pad_layer()); - TEST_ASSERT(test_min_layer()); - TEST_ASSERT(test_max_layer()); - TEST_ASSERT(test_global_max_pool_layer()); - TEST_ASSERT(test_global_avg_pool_layer()); - TEST_ASSERT(test_relu_layer()); - TEST_ASSERT(test_leaky_relu_layer()); - TEST_ASSERT(test_prelu_layer()); - TEST_ASSERT(test_concat_layer()); - printf("\n\n"); -} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_layer_sigmoid.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_layer_sigmoid.cpp deleted file mode 100644 index 0772ed3d..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_layer_sigmoid.cpp +++ /dev/null @@ -1,102 +0,0 @@ -#include "dl_layer_sigmoid.hpp" - -#include -#include - -#include "unity.h" - -#include "test_tool.hpp" - -template -std::unique_ptr sigmoid_float(const int output_exp, I *input_ptr, const int input_exp, const int size) -{ - std::unique_ptr ref(new O[size]); - - float scale = (input_exp > 0) ? (1 << input_exp) : ((float)1.0 / (1 << -input_exp)); - float rescale = (output_exp > 0) ? ((float)1.0 / (1 << output_exp)) : (1 << -output_exp); - - for (size_t i = 0; i < size; i++) { - float temp = exp((float)input_ptr[i] * scale); - temp = temp / (temp + 1); - - if constexpr (type == QIQO) - dl::tool::truncate(ref[i], temp * rescale); - else if constexpr (type == QIFO) - ref[i] = temp; - } - - return ref; -} - -template -bool testcase() -{ - // dl::tool::Latency latency; - - int input_exponent = -16; - int output_exponent = -14; - - int height = 5; - int width = 6; - int channel = 7; - - dl::Tensor input; - input.set_exponent(input_exponent).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - random_array(input.element, input.get_size()); - - // latency.start(); - std::unique_ptr ref = - sigmoid_float(output_exponent, input.element, input.exponent, input.get_size()); - // latency.end(); - // latency.print("float"); - - dl::layer::Sigmoid sigmoid(output_exponent); - sigmoid.build(input); - // latency.start(); - sigmoid.call(input); - // latency.end(); - // latency.print("quant"); - - return sigmoid.get_output().check_element(ref.get(), 2, false, 0); -} - -TEST_CASE("Sigmoid", "[dl::layer::Sigmoid]") -{ - bool ans = false; - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); -} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_layer_softmax.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_layer_softmax.cpp deleted file mode 100644 index c1b3f854..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_layer_softmax.cpp +++ /dev/null @@ -1,115 +0,0 @@ -#include "dl_layer_softmax.hpp" - -#include -#include - -#include "unity.h" - -#include "test_tool.hpp" - -template -std::unique_ptr softmax_float( - const int output_exp, I *input_ptr, const int input_exp, const uint32_t loop, const uint32_t channel) -{ - std::unique_ptr ref(new O[loop * channel]); - std::unique_ptr buf(new float[channel]); - - float scale = (input_exp > 0) ? (1 << input_exp) : ((float)1.0 / (1 << -input_exp)); - float rescale = (output_exp > 0) ? ((float)1.0 / (1 << output_exp)) : (1 << -output_exp); - - for (size_t i = 0; i < loop; i++) { - I max_input = input_ptr[0]; - for (size_t j = 1; j < channel; j++) max_input = DL_MAX(max_input, input_ptr[j]); - - float summary = 0.0; - for (size_t j = 0; j < channel; j++) { - buf[j] = exp(((float)input_ptr[j] - max_input) * scale); - summary += buf[j]; - } - - if constexpr (type == QIQO) { - summary = rescale / summary; - for (size_t j = 0; j < channel; j++) dl::tool::truncate(ref[i * channel + j], buf[j] * summary); - } else if constexpr (type == QIFO) { - summary = 1.0 / summary; - for (size_t j = 0; j < channel; j++) ref[i * channel + j] = buf[j] * summary; - } - - input_ptr += channel; - } - - return ref; -} - -template -bool testcase() -{ - // dl::tool::Latency latency; - - int input_exponent = -16; - int output_exponent = -15; - - int height = 5; - int width = 1; - int channel = 7; - - dl::Tensor input; - input.set_exponent(input_exponent).set_shape({height, width, channel}).malloc_element(); - random_array(input.element, input.get_size()); - - // latency.start(); - std::unique_ptr ref = - softmax_float(output_exponent, input.element, input.exponent, height * width, channel); - // latency.end(); - // latency.print("float"); - - dl::layer::Softmax softmax(output_exponent); - softmax.build(input); - // latency.start(); - softmax.call(input); - // latency.end(); - // latency.print("quant"); - - return softmax.get_output().check_element(ref.get(), 5, false, INT32_MAX); -} - -TEST_CASE("Softmax", "[dl::layer::Softmax]") -{ - bool ans = false; - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); -} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_layer_tanh.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_layer_tanh.cpp deleted file mode 100644 index 9fbfc4a5..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_layer_tanh.cpp +++ /dev/null @@ -1,101 +0,0 @@ -#include "dl_layer_tanh.hpp" - -#include -#include - -#include "unity.h" - -#include "test_tool.hpp" - -template -std::unique_ptr tanh_float(const int output_exp, I *input_ptr, const int input_exp, const int size) -{ - std::unique_ptr ref(new O[size]); - - float scale = DL_SCALE(input_exp + 1); - float rescale = DL_RESCALE(output_exp); - - for (size_t i = 0; i < size; i++) { - float temp = exp((float)input_ptr[i] * scale); - temp = (temp - 1.0f) / (temp + 1.0f); - - if constexpr (type == QIQO) - dl::tool::truncate(ref[i], temp * rescale); - else if constexpr (type == QIFO) - ref[i] = temp; - } - - return ref; -} - -template -bool testcase() -{ - dl::tool::Latency latency; - - int input_exponent = -14; - int output_exponent = -16; - - int height = 5; - int width = 6; - int channel = 7; - - dl::Tensor input; - input.set_exponent(input_exponent).set_shape({height, width, channel}).set_auto_free(true).malloc_element(); - random_array(input.element, input.get_size()); - - latency.start(); - std::unique_ptr ref = tanh_float(output_exponent, input.element, input.exponent, input.get_size()); - latency.end(); - latency.print("float"); - - dl::layer::TanH tanh(output_exponent); - tanh.build(input); - latency.start(); - tanh.call(input); - latency.end(); - latency.print("quant"); - - return tanh.get_output().check_element(ref.get(), 2, false, 100); -} - -TEST_CASE("TanH", "[dl::layer::TanH]") -{ - bool ans = false; - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); - - ans = testcase(); - TEST_ASSERT(ans); -} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_leakyrelu.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_leakyrelu.cpp deleted file mode 100644 index 3a4d27bf..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_leakyrelu.cpp +++ /dev/null @@ -1,256 +0,0 @@ -#include "test_data.hpp" - -#include "dl_base.hpp" -#include "dl_constant.hpp" -#include "dl_define.hpp" -#include "dl_nn_leakyrelu.hpp" -#include "dl_tool.hpp" -#include "dl_variable.hpp" - -#include "unity.h" -#include - -using namespace dl; -using namespace nn; -using namespace tool; -using namespace base; -using namespace std; - -template -void leakyrelu_c(Tensor &output, - Tensor &input, - const feature_t activation_alpha, - const int activation_exponent) -{ - int height = input.shape[0]; // inputs and output are the same shape - int width = input.shape[1]; - int channel = input.shape[2]; - - feature_t *input_element = input.get_element_ptr(); - int input_y_offset = input.shape[1] * input.shape[2]; - int input_x_offset = input.shape[2]; - - feature_t *output_element = output.get_element_ptr(); // output - int output_y_offset = output.shape[1] * output.shape[2]; - int output_x_offset = output.shape[2]; - - int buffer = 0; - - int activation_shift = -activation_exponent; - - for (size_t output_y = 0; output_y < height; output_y++) { - feature_t *input_11c = input_element; - feature_t *output_11c = output_element; - - for (size_t output_x = 0; output_x < width; output_x++) { - for (size_t output_c = 0; output_c < channel; output_c++) { - output_11c[output_c] = input_11c[output_c]; - if (output_11c[output_c] < 0) { - buffer = DL_RIGHT_SHIFT(output_11c[output_c] * activation_alpha, activation_shift); - tool::truncate(output_11c[output_c], buffer); - } - } - input_11c += input_x_offset; - output_11c += output_x_offset; - } - - input_element += input_y_offset; - output_element += output_y_offset; - } -} - -template -Tensor leakyrelu_c(Tensor &input, const feature_t activation_alpha, const int activation_exponent) -{ - Tensor output; - output.set_exponent(input.exponent).set_shape(input.shape).malloc_element(); - leakyrelu_c(output, input, activation_alpha, activation_exponent); - return output; -} - -bool test_leakyrelu_s8( - int exponent, int8_t activation_alpha, int offset0, int height, int width, int channel, bool inplace) -{ - if (!inplace) { - Tensor input0; - input0.set_element((int8_t *)&input0_element[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - // output; - Tensor output_c = leakyrelu_c(input0, activation_alpha, exponent); - latency.start(); - Tensor output = leakyrelu(input0, activation_alpha, exponent); - latency.end(); - latency.print(); - // printf("output: %p\n", &output); - return output.check_element(output_c.get_element_ptr(), 0, false); - } else { - Tensor input0_tmp; - input0_tmp.set_element((int8_t *)&input0_element[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - Tensor input0(input0_tmp, true); - input0.set_auto_free(true); - - // output; - Tensor output_c = leakyrelu_c(input0_tmp, activation_alpha, exponent); - latency.start(); - leakyrelu(input0, activation_alpha, exponent); - latency.end(); - latency.print(); - - // printf("input: %p\n", &input0); - // input0.print(0, 1, 0, 1, 0, 6, "input0"); - // output_c.print(0, 1, 0, 1, 0, 6, "output_c"); - return input0.check_element(output_c.get_element_ptr(), 0, false); - } -} - -bool test_leakyrelu_s16( - int exponent, int16_t activation_alpha, int offset0, int height, int width, int channel, bool inplace) -{ - if (!inplace) { - Tensor input0; - input0.set_element((int16_t *)&input0_element_s16[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - // output; - Tensor output_c = leakyrelu_c(input0, activation_alpha, exponent); - latency.start(); - Tensor output = leakyrelu(input0, activation_alpha, exponent); - latency.end(); - latency.print(); - // printf("output: %p\n", &output); - return output.check_element(output_c.get_element_ptr(), 0, false); - } else { - Tensor input0_tmp; - input0_tmp.set_element((int16_t *)&input0_element_s16[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - Tensor input0(input0_tmp, true); - input0.set_auto_free(true); - - // output; - Tensor output_c = leakyrelu_c(input0_tmp, activation_alpha, exponent); - latency.start(); - leakyrelu(input0, activation_alpha, exponent); - latency.end(); - latency.print(); - - // printf("input: %p\n", &input0); - // input0.print(0, 1, 0, 1, 0, 6, "input0"); - // output_c.print(0, 1, 0, 1, 0, 6, "output_c"); - return input0.check_element(output_c.get_element_ptr(), 0, false); - } -} - -// ---------------------------------------------------------------no-inplace------------------------------------------------------------------------------------ -TEST_CASE("test no inplace start", "[leakyrelu]") -{ - TEST_ASSERT(true); - printf("\n\n"); -} -// s8 - -TEST_CASE("test_leakyrelu_s8, c=6", "[leakyrelu]") -{ - TEST_ASSERT(test_leakyrelu_s8(-8, 45, 0, 5, 7, 6, true)); -} - -TEST_CASE("test_leakyrelu_s8, c=16", "[leakyrelu]") -{ - TEST_ASSERT(test_leakyrelu_s8(-8, -67, 0, 5, 7, 16, false)); -} - -TEST_CASE("test_leakyrelu_s8, c=16", "[leakyrelu]") -{ - TEST_ASSERT(test_leakyrelu_s8(-8, 125, 3, 5, 7, 16, true)); -} - -TEST_CASE("test_leakyrelu_s8, c=35", "[leakyrelu]") -{ - TEST_ASSERT(test_leakyrelu_s8(-8, 0, 0, 5, 7, 35, false)); -} - -// s16 - -TEST_CASE("test_leakyrelu_s16, c=6", "[leakyrelu]") -{ - TEST_ASSERT(test_leakyrelu_s16(-16, 45, 0, 5, 7, 6, true)); -} - -TEST_CASE("test_leakyrelu_s16, c=16", "[leakyrelu]") -{ - TEST_ASSERT(test_leakyrelu_s16(-16, -18726, 0, 5, 7, 16, false)); -} - -TEST_CASE("test_leakyrelu_s16, c=16", "[leakyrelu]") -{ - TEST_ASSERT(test_leakyrelu_s16(-16, 23456, 3, 5, 7, 16, true)); -} - -TEST_CASE("test_leakyrelu_s16, c=35", "[leakyrelu]") -{ - TEST_ASSERT(test_leakyrelu_s16(-16, 0, 0, 5, 7, 35, false)); -} - -//---------------------------------------------------------------inplace------------------------------------------------------------------------------------ -TEST_CASE("test inplace start", "[leakyrelu]") -{ - TEST_ASSERT(true); - printf("\n\n"); -} - -// s8 - -TEST_CASE("test_leakyrelu_s8, c=6", "[leakyrelu]") -{ - TEST_ASSERT(test_leakyrelu_s8(-8, 45, 0, 5, 7, 6, true)); -} - -TEST_CASE("test_leakyrelu_s8, c=16", "[leakyrelu]") -{ - TEST_ASSERT(test_leakyrelu_s8(-8, -67, 0, 5, 7, 16, true)); -} - -TEST_CASE("test_leakyrelu_s8, c=16", "[leakyrelu]") -{ - TEST_ASSERT(test_leakyrelu_s8(-8, 125, 3, 5, 7, 16, true)); -} - -TEST_CASE("test_leakyrelu_s8, c=35", "[leakyrelu]") -{ - TEST_ASSERT(test_leakyrelu_s8(-8, 0, 0, 5, 7, 35, true)); -} - -// s16 - -TEST_CASE("test_leakyrelu_s16, c=6", "[leakyrelu]") -{ - TEST_ASSERT(test_leakyrelu_s16(-16, 45, 0, 5, 7, 6, true)); -} - -TEST_CASE("test_leakyrelu_s16, c=16", "[leakyrelu]") -{ - TEST_ASSERT(test_leakyrelu_s16(-16, -18726, 0, 5, 7, 16, true)); -} - -TEST_CASE("test_leakyrelu_s16, c=16", "[leakyrelu]") -{ - TEST_ASSERT(test_leakyrelu_s16(-16, 23456, 3, 5, 7, 16, true)); -} - -TEST_CASE("test_leakyrelu_s16, c=35", "[leakyrelu]") -{ - TEST_ASSERT(test_leakyrelu_s16(-16, 0, 0, 5, 7, 35, true)); -} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_max.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_max.cpp deleted file mode 100644 index ccb5b2fc..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_max.cpp +++ /dev/null @@ -1,277 +0,0 @@ -#include "test_data.hpp" - -#include "dl_base.hpp" -#include "dl_constant.hpp" -#include "dl_define.hpp" -#include "dl_nn_max2d.hpp" -#include "dl_tool.hpp" -#include "dl_variable.hpp" - -#include "unity.h" -#include - -using namespace dl; -using namespace nn; -using namespace tool; -using namespace base; -using namespace std; - -template -void max2d_c(Tensor &output, Tensor &input0, Tensor &input1) -{ - int height = input0.shape[0]; // inputs and output are the same shape - int width = input0.shape[1]; - int channel = input0.shape[2]; - - feature_t *input0_element = input0.get_element_ptr(); - int input0_y_offset = input0.shape[1] * input0.shape[2]; - int input0_x_offset = input0.shape[2]; - - feature_t *input1_element = input1.get_element_ptr(); - int input1_y_offset = input1.shape[1] * input1.shape[2]; - int input1_x_offset = input1.shape[2]; - - feature_t *output_element = output.get_element_ptr(); // output - int output_y_offset = output.shape[1] * output.shape[2]; - int output_x_offset = output.shape[2]; - - for (size_t output_y = 0; output_y < height; output_y++) { - feature_t *input0_11c = input0_element; - feature_t *input1_11c = input1_element; - feature_t *output_11c = output_element; - - for (size_t output_x = 0; output_x < width; output_x++) { - for (size_t output_c = 0; output_c < channel; output_c++) { - output_11c[output_c] = DL_MAX(input0_11c[output_c], input1_11c[output_c]); - } - input0_11c += input0_x_offset; - input1_11c += input1_x_offset; - output_11c += output_x_offset; - } - - input0_element += input0_y_offset; - input1_element += input1_y_offset; - output_element += output_y_offset; - } -} - -template -Tensor max2d_c(Tensor &input0, Tensor &input1) -{ - assert(input0.is_same_shape(input1)); - assert(input0.exponent == input1.exponent); - - Tensor output; - output.set_exponent(input0.exponent).set_shape(input0.shape).malloc_element(); - max2d_c(output, input0, input1); - - return output; -} - -bool test_max_s8(int exponent, int offset0, int offset1, int height, int width, int channel, bool inplace) -{ - if (!inplace) { - Tensor input0; - input0.set_element((int8_t *)&input0_element[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1; - input1.set_element((int8_t *)&input1_element[offset1]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - // output; - latency.start(); - Tensor output = max2d(input0, input1); - latency.end(); - latency.print(); - - Tensor output_c = max2d_c(input0, input1); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else { - Tensor input0_tmp; - input0_tmp.set_element((int8_t *)&input0_element[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1_tmp; - input1_tmp.set_element((int8_t *)&input1_element[offset1]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - Tensor input0(input0_tmp, true); - input0.set_auto_free(true); - Tensor input1(input1_tmp, true); - input1.set_auto_free(true); - - // output; - latency.start(); - max2d(input0, input1); - latency.end(); - latency.print(); - - Tensor output_c = max2d_c(input0, input1); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } -} - -bool test_max_s16(int exponent, int offset0, int offset1, int height, int width, int channel, bool inplace) -{ - if (!inplace) { - Tensor input0; - input0.set_element((int16_t *)&input0_element_s16[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1; - input1.set_element((int16_t *)&input1_element_s16[offset1]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - // output; - latency.start(); - Tensor output = max2d(input0, input1); - latency.end(); - latency.print(); - - Tensor output_c = max2d_c(input0, input1); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else { - Tensor input0_tmp; - input0_tmp.set_element((int16_t *)&input0_element_s16[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1_tmp; - input1_tmp.set_element((int16_t *)&input1_element_s16[offset1]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - Tensor input0(input0_tmp, true); - input0.set_auto_free(true); - Tensor input1(input1_tmp, true); - input1.set_auto_free(true); - - // output; - latency.start(); - max2d(input0, input1); - latency.end(); - latency.print(); - - Tensor output_c = max2d_c(input0, input1); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } -} - -//---------------------------------------------------------------no-inplace------------------------------------------------------------------------------------ -TEST_CASE("test no inplace start", "[max]") -{ - TEST_ASSERT(true); - printf("\n\n"); -} - -TEST_CASE("test_max2d_s8, c=6", "[max]") -{ - TEST_ASSERT(test_max_s8(-8, 0, 0, 5, 7, 6, false)); -} - -TEST_CASE("test_max2d_s8, c=16", "[max]") -{ - TEST_ASSERT(test_max_s8(-8, 0, 0, 5, 7, 16, false)); -} - -TEST_CASE("test_max2d_s8, c=16", "[max]") -{ - TEST_ASSERT(test_max_s8(-8, 4, 7, 5, 7, 16, false)); -} - -TEST_CASE("test_max2d_s8, c=35", "[max]") -{ - TEST_ASSERT(test_max_s8(-8, 0, 0, 5, 7, 35, false)); -} - -// s16 -TEST_CASE("test_max2d_s16, c=6", "[max]") -{ - TEST_ASSERT(test_max_s16(-8, 0, 0, 5, 7, 6, false)); -} - -TEST_CASE("test_max2d_s16, c=16", "[max]") -{ - TEST_ASSERT(test_max_s16(-8, 0, 0, 5, 7, 16, false)); -} - -TEST_CASE("test_max2d_s16, c=16", "[max]") -{ - TEST_ASSERT(test_max_s16(-8, 4, 7, 5, 7, 16, false)); -} - -TEST_CASE("test_max2d_s16, c=35", "[max]") -{ - TEST_ASSERT(test_max_s16(-8, 0, 0, 5, 7, 35, false)); -} - -//---------------------------------------------------------------inplace------------------------------------------------------------------------------------ -TEST_CASE("test inplace start", "[max]") -{ - TEST_ASSERT(true); - printf("\n\n"); -} - -TEST_CASE("test_max2d_s8, c=6", "[max]") -{ - TEST_ASSERT(test_max_s8(-8, 0, 0, 5, 7, 6, true)); -} - -TEST_CASE("test_max2d_s8, c=16", "[max]") -{ - TEST_ASSERT(test_max_s8(-8, 0, 0, 5, 7, 16, true)); -} - -TEST_CASE("test_max2d_s8, c=16", "[max]") -{ - TEST_ASSERT(test_max_s8(-8, 4, 7, 5, 7, 16, true)); -} - -TEST_CASE("test_max2d_s8, c=35", "[max]") -{ - TEST_ASSERT(test_max_s8(-8, 0, 0, 5, 7, 35, true)); -} - -// s16 -TEST_CASE("test_max2d_s16, c=6", "[max]") -{ - TEST_ASSERT(test_max_s16(-8, 0, 0, 5, 7, 6, true)); -} - -TEST_CASE("test_max2d_s16, c=16", "[max]") -{ - TEST_ASSERT(test_max_s16(-8, 0, 0, 5, 7, 16, true)); -} - -TEST_CASE("test_max2d_s16, c=16", "[max]") -{ - TEST_ASSERT(test_max_s16(-8, 4, 7, 5, 7, 16, true)); -} - -TEST_CASE("test_max2d_s16, c=35", "[max]") -{ - TEST_ASSERT(test_max_s16(-8, 0, 0, 5, 7, 35, true)); -} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_min.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_min.cpp deleted file mode 100644 index 5bd3b9c3..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_min.cpp +++ /dev/null @@ -1,273 +0,0 @@ -#include "test_data.hpp" - -#include "dl_base.hpp" -#include "dl_constant.hpp" -#include "dl_define.hpp" -#include "dl_nn_min2d.hpp" -#include "dl_tool.hpp" -#include "dl_variable.hpp" - -#include "unity.h" -#include - -using namespace dl; -using namespace nn; -using namespace tool; -using namespace base; -using namespace std; - -template -void min2d_c(Tensor &output, Tensor &input0, Tensor &input1) -{ - int height = input0.shape[0]; // inputs and output are the same shape - int width = input0.shape[1]; - int channel = input0.shape[2]; - - feature_t *input0_element = input0.get_element_ptr(); - int input0_y_offset = input0.shape[1] * input0.shape[2]; - int input0_x_offset = input0.shape[2]; - - feature_t *input1_element = input1.get_element_ptr(); - int input1_y_offset = input1.shape[1] * input1.shape[2]; - int input1_x_offset = input1.shape[2]; - - feature_t *output_element = output.get_element_ptr(); // output - int output_y_offset = output.shape[1] * output.shape[2]; - int output_x_offset = output.shape[2]; - - for (size_t output_y = 0; output_y < height; output_y++) { - feature_t *input0_11c = input0_element; - feature_t *input1_11c = input1_element; - feature_t *output_11c = output_element; - - for (size_t output_x = 0; output_x < width; output_x++) { - for (size_t output_c = 0; output_c < channel; output_c++) { - output_11c[output_c] = DL_MIN(input0_11c[output_c], input1_11c[output_c]); - } - input0_11c += input0_x_offset; - input1_11c += input1_x_offset; - output_11c += output_x_offset; - } - - input0_element += input0_y_offset; - input1_element += input1_y_offset; - output_element += output_y_offset; - } -} - -template -Tensor min2d_c(Tensor &input0, Tensor &input1) -{ - assert(input0.is_same_shape(input1)); - assert(input0.exponent == input1.exponent); - - Tensor output; - output.set_exponent(input0.exponent).set_shape(input0.shape).malloc_element(); - min2d_c(output, input0, input1); - - return output; -} - -bool test_min_s8(int exponent, int offset0, int offset1, int height, int width, int channel, bool inplace) -{ - if (!inplace) { - Tensor input0; - input0.set_element((int8_t *)&input0_element[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1; - input1.set_element((int8_t *)&input1_element[offset1]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - // output; - Tensor output_c = min2d_c(input0, input1); - latency.start(); - Tensor output = min2d(input0, input1); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else { - Tensor input0_tmp; - input0_tmp.set_element((int8_t *)&input0_element[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1_tmp; - input1_tmp.set_element((int8_t *)&input1_element[offset1]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - Tensor input0(input0_tmp, true); - input0.set_auto_free(true); - Tensor input1(input0_tmp, true); - input1.set_auto_free(true); - - // output; - Tensor output_c = min2d_c(input0, input1); - latency.start(); - min2d(input0, input1); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } -} - -bool test_min_s16(int exponent, int offset0, int offset1, int height, int width, int channel, bool inplace) -{ - if (!inplace) { - Tensor input0; - input0.set_element((int16_t *)&input0_element_s16[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1; - input1.set_element((int16_t *)&input1_element_s16[offset1]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - // output; - Tensor output_c = min2d_c(input0, input1); - latency.start(); - Tensor output = min2d(input0, input1); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else { - Tensor input0_tmp; - input0_tmp.set_element((int16_t *)&input0_element_s16[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1_tmp; - input1_tmp.set_element((int16_t *)&input1_element_s16[offset1]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - Tensor input0(input0_tmp, true); - input0.set_auto_free(true); - Tensor input1(input1_tmp, true); - input1.set_auto_free(true); - - // output; - Tensor output_c = min2d_c(input0, input1); - latency.start(); - min2d(input0, input1); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } -} - -//---------------------------------------------------------------no-inplace------------------------------------------------------------------------------------ -TEST_CASE("test no inplace start", "[min]") -{ - TEST_ASSERT(true); - printf("\n\n"); -} - -TEST_CASE("test_min2d_s8, c=6", "[min]") -{ - TEST_ASSERT(test_min_s8(-8, 0, 0, 5, 7, 6, false)); -} - -TEST_CASE("test_min2d_s8, c=16", "[min]") -{ - TEST_ASSERT(test_min_s8(-8, 0, 0, 5, 7, 16, false)); -} - -TEST_CASE("test_min2d_s8, c=16", "[min]") -{ - TEST_ASSERT(test_min_s8(-8, 4, 7, 5, 7, 16, false)); -} - -TEST_CASE("test_min2d_s8, c=35", "[min]") -{ - TEST_ASSERT(test_min_s8(-8, 0, 0, 5, 7, 35, false)); -} - -// s16 -TEST_CASE("test_min2d_s16, c=6", "[min]") -{ - TEST_ASSERT(test_min_s16(-8, 0, 0, 5, 7, 6, false)); -} - -TEST_CASE("test_min2d_s16, c=16", "[min]") -{ - TEST_ASSERT(test_min_s16(-8, 0, 0, 5, 7, 16, false)); -} - -TEST_CASE("test_min2d_s16, c=16", "[min]") -{ - TEST_ASSERT(test_min_s16(-8, 4, 7, 5, 7, 16, false)); -} - -TEST_CASE("test_min2d_s16, c=35", "[min]") -{ - TEST_ASSERT(test_min_s16(-8, 0, 0, 5, 7, 35, false)); -} - -//---------------------------------------------------------------inplace------------------------------------------------------------------------------------ -TEST_CASE("test inplace start", "[min]") -{ - TEST_ASSERT(true); - printf("\n\n"); -} - -TEST_CASE("test_min2d_s8, c=6", "[min]") -{ - TEST_ASSERT(test_min_s8(-8, 0, 0, 5, 7, 6, true)); -} - -TEST_CASE("test_min2d_s8, c=16", "[min]") -{ - TEST_ASSERT(test_min_s8(-8, 0, 0, 5, 7, 16, true)); -} - -TEST_CASE("test_min2d_s8, c=16", "[min]") -{ - TEST_ASSERT(test_min_s8(-8, 4, 7, 5, 7, 16, true)); -} - -TEST_CASE("test_min2d_s8, c=35", "[min]") -{ - TEST_ASSERT(test_min_s8(-8, 0, 0, 5, 7, 35, true)); -} - -// s16 -TEST_CASE("test_min2d_s16, c=6", "[min]") -{ - TEST_ASSERT(test_min_s16(-8, 0, 0, 5, 7, 6, true)); -} - -TEST_CASE("test_min2d_s16, c=16", "[min]") -{ - TEST_ASSERT(test_min_s16(-8, 0, 0, 5, 7, 16, true)); -} - -TEST_CASE("test_min2d_s16, c=16", "[min]") -{ - TEST_ASSERT(test_min_s16(-8, 4, 7, 5, 7, 16, true)); -} - -TEST_CASE("test_min2d_s16, c=35", "[min]") -{ - TEST_ASSERT(test_min_s16(-8, 0, 0, 5, 7, 35, true)); -} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_mul.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_mul.cpp deleted file mode 100644 index 2fbd23e9..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_mul.cpp +++ /dev/null @@ -1,625 +0,0 @@ -#include "test_data.hpp" - -#include "dl_base.hpp" -#include "dl_constant.hpp" -#include "dl_define.hpp" -#include "dl_nn_mul2d.hpp" -#include "dl_tool.hpp" -#include "dl_variable.hpp" - -#include "unity.h" -#include - -using namespace dl; -using namespace nn; -using namespace tool; -using namespace base; -using namespace std; - -template -void mul2d_c(Tensor &output, - Tensor &input0, - Tensor &input1, - const Activation *const activation) -{ - int height = input0.shape[0]; // inputs and output are the same shape - int width = input0.shape[1]; - int channel = input0.shape[2]; - - feature_t *input0_element = input0.get_element_ptr(); - int input0_y_offset = input0.shape[1] * input0.shape[2]; - int input0_x_offset = input0.shape[2]; - - feature_t *input1_element = input1.get_element_ptr(); - int input1_y_offset = input1.shape[1] * input1.shape[2]; - int input1_x_offset = input1.shape[2]; - - feature_t *output_element = output.get_element_ptr(); // output - int output_y_offset = output.shape[1] * output.shape[2]; - int output_x_offset = output.shape[2]; - - int buffer = 0; - int mul_shift = output.exponent - input0.exponent - input1.exponent; - - activation_type_t activation_type = activation ? activation->type : Linear; - feature_t activation_alpha; - int activation_shift; - const feature_t *activation_alpha_ptr; - - switch (activation_type) { - case ReLU: - activation_alpha = 0; - activation_shift = 0; - activation_alpha_ptr = NULL; - break; - case LeakyReLU: - activation_alpha = activation->element[0]; - activation_shift = -activation->exponent; - activation_alpha_ptr = NULL; - break; - case PReLU: - activation_alpha = 0; - activation_alpha_ptr = activation->element; - activation_shift = -activation->exponent; - break; - default: - activation_alpha = 0; - activation_alpha_ptr = NULL; - activation_shift = -1; - break; - } - - for (size_t output_y = 0; output_y < height; output_y++) { - feature_t *input0_11c = input0_element; - feature_t *input1_11c = input1_element; - feature_t *output_11c = output_element; - - for (size_t output_x = 0; output_x < width; output_x++) { - for (size_t output_c = 0; output_c < channel; output_c++) { - buffer = (int)input0_11c[output_c] * (int)input1_11c[output_c]; - buffer = DL_RIGHT_SHIFT(buffer, mul_shift); - tool::truncate(output_11c[output_c], buffer); - if (activation_type == ReLU) { - output_11c[output_c] = DL_MAX(0, output_11c[output_c]); - } else if (activation_type == LeakyReLU) { - if (output_11c[output_c] < 0) { - buffer = DL_RIGHT_SHIFT((output_11c[output_c] * activation_alpha), activation_shift); - tool::truncate(output_11c[output_c], buffer); - } - } else if (activation_type == PReLU) { - if (output_11c[output_c] < 0) { - buffer = - DL_RIGHT_SHIFT((output_11c[output_c] * activation_alpha_ptr[output_c]), activation_shift); - tool::truncate(output_11c[output_c], buffer); - } - } - } - input0_11c += input0_x_offset; - input1_11c += input1_x_offset; - output_11c += output_x_offset; - } - - input0_element += input0_y_offset; - input1_element += input1_y_offset; - output_element += output_y_offset; - } -} - -template -Tensor mul2d_c(const int output_exponent, - Tensor &input0, - Tensor &input1, - const Activation *activation, - const std::vector &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -{ - assert(input0.is_same_shape(input1)); - - Tensor output; - output.set_exponent(output_exponent).set_shape(input0.shape).malloc_element(); - mul2d_c(output, input0, input1, activation); - - return output; -} - -bool test_mul_s8(int exponent0, - int exponent1, - int exponent_out, - int offset0, - int offset1, - int height, - int width, - int channel, - int activation_type, - bool inplace) -{ - if (!inplace) { - Tensor input0; - input0.set_element((int8_t *)&input0_element[offset0]) - .set_exponent(exponent0) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1; - input1.set_element((int8_t *)&input1_element[offset1]) - .set_exponent(exponent1) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - // output; - if (activation_type == 0) { - Tensor output_c = mul2d_c(exponent_out, input0, input1, (Activation *)NULL); - latency.start(); - Tensor output = mul2d( - exponent_out, input0, input1, (Activation *)NULL); //(Activation *)NULL - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 1) { // relu - Tensor output_c = mul2d_c(exponent_out, input0, input1, &layer_activation_relu); - latency.start(); - Tensor output = mul2d(exponent_out, input0, input1, &layer_activation_relu); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 2) { // leakyrelu - Tensor output_c = mul2d_c(exponent_out, input0, input1, &layer_activation_lrelu); - latency.start(); - Tensor output = mul2d(exponent_out, input0, input1, &layer_activation_lrelu); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 3) { // prelu - Tensor output_c = mul2d_c(exponent_out, input0, input1, &layer_activation_prelu); - latency.start(); - Tensor output = mul2d(exponent_out, input0, input1, &layer_activation_prelu); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } - return false; - } else { - Tensor input0_tmp; - input0_tmp.set_element((int8_t *)&input0_element[offset0]) - .set_exponent(exponent0) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1_tmp; - input1_tmp.set_element((int8_t *)&input1_element[offset1]) - .set_exponent(exponent1) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - Tensor input0(input0_tmp, true); - input0.set_auto_free(true); - Tensor input1(input1_tmp, true); - input1.set_auto_free(true); - - // output; - if (activation_type == 0) { - Tensor output_c = mul2d_c(exponent_out, input0, input1, (Activation *)NULL); - latency.start(); - mul2d(exponent_out, input0, input1, (Activation *)NULL); //(Activation *)NULL - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 1) { // relu - Tensor output_c = mul2d_c(exponent_out, input0, input1, &layer_activation_relu); - latency.start(); - mul2d(exponent_out, input0, input1, &layer_activation_relu); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 2) { // leakyrelu - Tensor output_c = mul2d_c(exponent_out, input0, input1, &layer_activation_lrelu); - latency.start(); - mul2d(exponent_out, input0, input1, &layer_activation_lrelu); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 3) { // prelu - Tensor output_c = mul2d_c(exponent_out, input0, input1, &layer_activation_prelu); - latency.start(); - mul2d(exponent_out, input0, input1, &layer_activation_prelu); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } - return false; - } -} - -bool test_mul_s16(int exponent0, - int exponent1, - int exponent_out, - int offset0, - int offset1, - int height, - int width, - int channel, - int activation_type, - bool inplace) -{ - if (!inplace) { - Tensor input0; - input0.set_element((int16_t *)&input0_element_s16[offset0]) - .set_exponent(exponent0) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1; - input1.set_element((int16_t *)&input1_element_s16[offset1]) - .set_exponent(exponent1) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - // output; - if (activation_type == 0) { - Tensor output_c = mul2d_c(exponent_out, input0, input1, (Activation *)NULL); - latency.start(); - Tensor output = mul2d( - exponent_out, input0, input1, (Activation *)NULL); //(Activation *)NULL - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 1) { // relu - Tensor output_c = mul2d_c(exponent_out, input0, input1, &layer_activation_relu_s16); - latency.start(); - Tensor output = mul2d(exponent_out, input0, input1, &layer_activation_relu_s16); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 2) { // leakyrelu - Tensor output_c = mul2d_c(exponent_out, input0, input1, &layer_activation_lrelu_s16); - latency.start(); - Tensor output = mul2d(exponent_out, input0, input1, &layer_activation_lrelu_s16); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 3) { // prelu - Tensor output_c = mul2d_c(exponent_out, input0, input1, &layer_activation_prelu_s16); - latency.start(); - Tensor output = mul2d(exponent_out, input0, input1, &layer_activation_prelu_s16); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } - return false; - } else { - Tensor input0_tmp; - input0_tmp.set_element((int16_t *)&input0_element_s16[offset0]) - .set_exponent(exponent0) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1_tmp; - input1_tmp.set_element((int16_t *)&input1_element_s16[offset1]) - .set_exponent(exponent1) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - Tensor input0(input0_tmp, true); - input0.set_auto_free(true); - Tensor input1(input1_tmp, true); - input1.set_auto_free(true); - - // output; - if (activation_type == 0) { - Tensor output_c = mul2d_c(exponent_out, input0, input1, (Activation *)NULL); - latency.start(); - mul2d( - exponent_out, input0, input1, (Activation *)NULL); //(Activation *)NULL - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 1) { // relu - Tensor output_c = mul2d_c(exponent_out, input0, input1, &layer_activation_relu_s16); - latency.start(); - mul2d(exponent_out, input0, input1, &layer_activation_relu_s16); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 2) { // leakyrelu - Tensor output_c = mul2d_c(exponent_out, input0, input1, &layer_activation_lrelu_s16); - latency.start(); - mul2d(exponent_out, input0, input1, &layer_activation_lrelu_s16); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 3) { // prelu - Tensor output_c = mul2d_c(exponent_out, input0, input1, &layer_activation_prelu_s16); - latency.start(); - mul2d(exponent_out, input0, input1, &layer_activation_prelu_s16); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } - return false; - } -} - -//---------------------------------------------------------------no-inplace------------------------------------------------------------------------------------ -TEST_CASE("test no inplace start", "[mul]") -{ - TEST_ASSERT(true); - printf("\n\n"); -} - -// s8, c = 6 -TEST_CASE("test_mul2d, c=6", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 6, 0, false)); -} - -TEST_CASE("test_mul2d_relu, c=6", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 6, 1, false)); -} - -TEST_CASE("test_mul2d_leakyrelu, c=6", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_mul2d_prelu, c=6", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 6, 3, false)); -} - -// s8, c = 16 -TEST_CASE("test_mul2d, c=16", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 16, 0, false)); -} - -TEST_CASE("test_mul2d_relu, c=16", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 16, 1, false)); -} - -TEST_CASE("test_mul2d_leakyrelu, c=16", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 16, 2, false)); -} - -TEST_CASE("test_mul2d_prelu, c=16", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 16, 3, false)); -} - -// s8, c = 35 -TEST_CASE("test_mul2d, c=35", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_mul2d_relu, c=35", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 35, 1, false)); -} - -TEST_CASE("test_mul2d_leakyrelu, c=35", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 35, 2, false)); -} - -TEST_CASE("test_mul2d_prelu, c=35", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 35, 3, false)); -} - -// s16, c = 6 -TEST_CASE("test_mul2d, c=6", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 6, 0, false)); -} - -TEST_CASE("test_mul2d_relu, c=6", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 6, 1, false)); -} - -TEST_CASE("test_mul2d_leakyrelu, c=6", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_mul2d_prelu, c=6", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 6, 3, false)); -} - -// s16, c = 16 -TEST_CASE("test_mul2d, c=16", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 16, 0, false)); -} - -TEST_CASE("test_mul2d_relu, c=16", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 16, 1, false)); -} - -TEST_CASE("test_mul2d_leakyrelu, c=16", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 16, 2, false)); -} - -TEST_CASE("test_mul2d_prelu, c=16", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 16, 3, false)); -} - -// s16, c = 35 -TEST_CASE("test_mul2d, c=35", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_mul2d_relu, c=35", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 35, 1, false)); -} - -TEST_CASE("test_mul2d_leakyrelu, c=35", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 35, 2, false)); -} - -TEST_CASE("test_mul2d_prelu, c=35", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 35, 3, false)); -} - -//---------------------------------------------------------------inplace------------------------------------------------------------------------------------ -TEST_CASE("test inplace start", "[min]") -{ - TEST_ASSERT(true); - printf("\n\n"); -} - -// s8, c = 6 -TEST_CASE("test_mul2d, c=6", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 6, 0, true)); -} - -TEST_CASE("test_mul2d_relu, c=6", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 6, 1, true)); -} - -TEST_CASE("test_mul2d_leakyrelu, c=6", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_mul2d_prelu, c=6", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 6, 3, true)); -} - -// s8, c = 16 -TEST_CASE("test_mul2d, c=16", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 16, 0, true)); -} - -TEST_CASE("test_mul2d_relu, c=16", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 16, 1, true)); -} - -TEST_CASE("test_mul2d_leakyrelu, c=16", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 16, 2, true)); -} - -TEST_CASE("test_mul2d_prelu, c=16", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 16, 3, true)); -} - -// s8, c = 35 -TEST_CASE("test_mul2d, c=35", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_mul2d_relu, c=35", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 35, 1, true)); -} - -TEST_CASE("test_mul2d_leakyrelu, c=35", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 35, 2, true)); -} - -TEST_CASE("test_mul2d_prelu, c=35", "[mul]") -{ - TEST_ASSERT(test_mul_s8(-8, -8, -8, 4, 7, 5, 7, 35, 3, true)); -} - -// s16, c = 6 -TEST_CASE("test_mul2d, c=6", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 6, 0, true)); -} - -TEST_CASE("test_mul2d_relu, c=6", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 6, 1, true)); -} - -TEST_CASE("test_mul2d_leakyrelu, c=6", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_mul2d_prelu, c=6", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 6, 3, true)); -} - -// s16, c = 16 -TEST_CASE("test_mul2d, c=16", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 16, 0, true)); -} - -TEST_CASE("test_mul2d_relu, c=16", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 16, 1, true)); -} - -TEST_CASE("test_mul2d_leakyrelu, c=16", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 16, 2, true)); -} - -TEST_CASE("test_mul2d_prelu, c=16", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 16, 3, true)); -} - -// s16, c = 35 -TEST_CASE("test_mul2d, c=35", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_mul2d_relu, c=35", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 35, 1, true)); -} - -TEST_CASE("test_mul2d_leakyrelu, c=35", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 35, 2, true)); -} - -TEST_CASE("test_mul2d_prelu, c=35", "[mul]") -{ - TEST_ASSERT(test_mul_s16(-8, -8, -8, 4, 7, 5, 7, 35, 3, true)); -} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_pad.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_pad.cpp deleted file mode 100644 index fdd3119e..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_pad.cpp +++ /dev/null @@ -1,73 +0,0 @@ -#include "unity.h" -#include -#include - -#include "esp_system.h" -#include "freertos/FreeRTOS.h" -#include "freertos/task.h" - -#include "dl_nn_pad.hpp" -#include "dl_variable.hpp" - -#include "dl_layer_max2d.hpp" -#include "dl_layer_pad.hpp" - -using namespace dl; -using namespace std; - -TEST_CASE("PAD", "shape") -{ - printf("--------------------------------------------\n"); - Tensor x = Tensor::arange(2 * 3 * 4); - x.reshape({2, 3, 4}).print({}, "x"); - Tensor out; - out = nn::pad(x, {1, 0, 1, 2, 0, 1}, {-1, -2, -3, -4, -5, -6}, PADDING_CONSTANT); - out.print({}, "PADDING_CONSTANT 1"); - - out = nn::pad(x, {0, 1, 2, 1, 1, 0}, {}, PADDING_EDGE); - out.print({}, "PADDING_EDGE 1"); - - out = nn::pad(x, {0, 0, 2, 1, 1, 0}, {}, PADDING_EDGE); - out.print({}, "PADDING_EDGE 2"); - - out = nn::pad(x, {0, 0, 0, 1, 0, 0}, {}, PADDING_EDGE); - out.print({}, "PADDING_EDGE 3"); - - out = nn::pad(x, {1, 1, 1, 2, 1, 1}, {}, PADDING_REFLECT); - out.print({}, "PADDING_REFLECT 1"); - - out = nn::pad(x, {1, 0, 1, 2, 0, 1}, {}, PADDING_REFLECT); - out.print({}, "PADDING_REFLECT 2"); - - out = nn::pad(x, {1, 2, 1, 1, 1, 1}, {}, PADDING_SYMMETRIC); - out.print({}, "PADDING_SYMMETRIC 1"); - - out = nn::pad(x, {1, 0, 1, 2, 0, 1}, {}, PADDING_SYMMETRIC); - out.print({}, "PADDING_SYMMETRIC 2"); - - x.expand_dims({1}); - out = nn::pad(x, {1, 0, 1, 2, 0, 1, 1, 1}, {}, PADDING_REFLECT); - out.print({}, "PADDING_REFLECT 3"); - - out = nn::pad(x, {0, 1, 1, 0, 1, 2, 0, 1}, {}, PADDING_SYMMETRIC); - out.print({}, "PADDING_SYMMETRIC 3"); - - x.flatten().expand_dims({0}); - out = nn::pad(x, {1, 2, 0, 1}, {}, PADDING_REFLECT); - out.print({}, "PADDING_REFLECT 4"); - - out = nn::pad(x, {0, 1, 0, 0}, {}, PADDING_REFLECT); - out.print({}, "PADDING_REFLECT 5"); - - out = nn::pad(x, {2, 1, 1, 0}, {}, PADDING_SYMMETRIC); - out.print({}, "PADDING_SYMMETRIC 4"); - - out = nn::pad(x, {1, 0, 2, 2}, {}, PADDING_SYMMETRIC); - out.print({}, "PADDING_SYMMETRIC 5"); - printf("\n\n++++++++++++++++++++++++++++++++++++++++++++\n\n"); - - dl::layer::Pad padlayer({1}, {-1}); - padlayer.build(x.reshape({2, 3, -1}), true); - Tensor &output = padlayer.call(x); - output.print({}, "layer"); -} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_prelu.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_prelu.cpp deleted file mode 100644 index 1e8fd2c6..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_prelu.cpp +++ /dev/null @@ -1,251 +0,0 @@ -#include "test_data.hpp" - -#include "dl_base.hpp" -#include "dl_constant.hpp" -#include "dl_define.hpp" -#include "dl_nn_prelu.hpp" -#include "dl_tool.hpp" -#include "dl_variable.hpp" - -#include "unity.h" -#include - -using namespace dl; -using namespace nn; -using namespace tool; -using namespace base; -using namespace std; - -template -void prelu_c(Tensor &output, - Tensor &input, - const feature_t *activation_element, - const int activation_exponent) -{ - int height = input.shape[0]; // inputs and output are the same shape - int width = input.shape[1]; - int channel = input.shape[2]; - - feature_t *input_element = input.get_element_ptr(); - int input_y_offset = input.shape[1] * input.shape[2]; - int input_x_offset = input.shape[2]; - - feature_t *output_element = output.get_element_ptr(); // output - int output_y_offset = output.shape[1] * output.shape[2]; - int output_x_offset = output.shape[2]; - - int buffer = 0; - - int activation_shift = -activation_exponent; - const feature_t *activation_alpha_ptr = activation_element; - - for (size_t output_y = 0; output_y < height; output_y++) { - feature_t *input_11c = input_element; - feature_t *output_11c = output_element; - - for (size_t output_x = 0; output_x < width; output_x++) { - for (size_t output_c = 0; output_c < channel; output_c++) { - output_11c[output_c] = input_11c[output_c]; - if (output_11c[output_c] < 0) { - buffer = DL_RIGHT_SHIFT(output_11c[output_c] * activation_alpha_ptr[output_c], activation_shift); - tool::truncate(output_11c[output_c], buffer); - } - } - input_11c += input_x_offset; - output_11c += output_x_offset; - } - - input_element += input_y_offset; - output_element += output_y_offset; - } -} - -template -Tensor prelu_c(Tensor &input, const feature_t *activation_element, const int activation_exponent) -{ - Tensor output; - output.set_exponent(input.exponent).set_shape(input.shape).malloc_element(); - prelu_c(output, input, activation_element, activation_exponent); - return output; -} - -bool test_prelu_s8(int exponent, int offset0, int height, int width, int channel, bool inplace) -{ - if (!inplace) { - Tensor input0; - input0.set_element((int8_t *)&input0_element[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - // output; - Tensor output_c = prelu_c(input0, layer_activation_prelu_element, exponent); - latency.start(); - Tensor output = prelu(input0, layer_activation_prelu_element, exponent); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else { - Tensor input0_tmp; - input0_tmp.set_element((int8_t *)&input0_element[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - Tensor input0(input0_tmp, true); - input0.set_auto_free(true); - - // output; - Tensor output_c = prelu_c(input0, layer_activation_prelu_element, exponent); - latency.start(); - prelu(input0, layer_activation_prelu_element, exponent); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } -} - -bool test_prelu_s16(int exponent, int offset0, int height, int width, int channel, bool inplace) -{ - if (!inplace) { - Tensor input0; - input0.set_element((int16_t *)&input0_element_s16[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - // output; - Tensor output_c = prelu_c(input0, layer_activation_prelu_element_s16, exponent); - latency.start(); - Tensor output = prelu(input0, layer_activation_prelu_element_s16, exponent); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else { - Tensor input0_tmp; - input0_tmp.set_element((int16_t *)&input0_element_s16[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - Tensor input0(input0_tmp, true); - input0.set_auto_free(true); - - // output; - Tensor output_c = prelu_c(input0, layer_activation_prelu_element_s16, exponent); - latency.start(); - prelu(input0, layer_activation_prelu_element_s16, exponent); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } -} - -//---------------------------------------------------------------no-inplace------------------------------------------------------------------------------------ -TEST_CASE("test no inplace start", "[prelu]") -{ - TEST_ASSERT(true); - printf("\n\n"); -} -// s8 - -TEST_CASE("test_prelu_s8, c=6", "[prelu]") -{ - TEST_ASSERT(test_prelu_s8(-8, 0, 5, 7, 6, false)); -} - -TEST_CASE("test_prelu_s8, c=16", "[prelu]") -{ - TEST_ASSERT(test_prelu_s8(-8, 0, 5, 7, 16, false)); -} - -TEST_CASE("test_prelu_s8, c=16", "[prelu]") -{ - TEST_ASSERT(test_prelu_s8(-8, 5, 5, 7, 16, false)); -} - -TEST_CASE("test_prelu_s8, c=35", "[prelu]") -{ - TEST_ASSERT(test_prelu_s8(-8, 0, 5, 7, 35, false)); -} - -// s16 - -TEST_CASE("test_prelu_s16, c=6", "[prelu]") -{ - TEST_ASSERT(test_prelu_s16(-16, 0, 5, 7, 6, false)); -} - -TEST_CASE("test_prelu_s16, c=16", "[prelu]") -{ - TEST_ASSERT(test_prelu_s16(-16, 0, 5, 7, 16, false)); -} - -TEST_CASE("test_prelu_s16, c=16", "[prelu]") -{ - TEST_ASSERT(test_prelu_s16(-16, 5, 5, 7, 16, false)); -} - -TEST_CASE("test_prelu_s16, c=35", "[prelu]") -{ - TEST_ASSERT(test_prelu_s16(-16, 0, 5, 7, 35, false)); -} - -//---------------------------------------------------------------inplace------------------------------------------------------------------------------------ -TEST_CASE("test inplace start", "[prelu]") -{ - TEST_ASSERT(true); - printf("\n\n"); -} - -// s8 - -TEST_CASE("test_prelu_s8, c=6", "[prelu]") -{ - TEST_ASSERT(test_prelu_s8(-8, 0, 5, 7, 6, true)); -} - -TEST_CASE("test_prelu_s8, c=16", "[prelu]") -{ - TEST_ASSERT(test_prelu_s8(-8, 0, 5, 7, 16, true)); -} - -TEST_CASE("test_prelu_s8, c=16", "[prelu]") -{ - TEST_ASSERT(test_prelu_s8(-8, 5, 5, 7, 16, true)); -} - -TEST_CASE("test_prelu_s8, c=35", "[prelu]") -{ - TEST_ASSERT(test_prelu_s8(-8, 0, 5, 7, 35, true)); -} - -// s16 - -TEST_CASE("test_prelu_s16, c=6", "[prelu]") -{ - TEST_ASSERT(test_prelu_s16(-16, 0, 5, 7, 6, true)); -} - -TEST_CASE("test_prelu_s16, c=16", "[prelu]") -{ - TEST_ASSERT(test_prelu_s16(-16, 0, 5, 7, 16, true)); -} - -TEST_CASE("test_prelu_s16, c=16", "[prelu]") -{ - TEST_ASSERT(test_prelu_s16(-16, 5, 5, 7, 16, true)); -} - -TEST_CASE("test_prelu_s16, c=35", "[prelu]") -{ - TEST_ASSERT(test_prelu_s16(-16, 0, 5, 7, 35, true)); -} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_relu.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_relu.cpp deleted file mode 100644 index efa0fde7..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_relu.cpp +++ /dev/null @@ -1,232 +0,0 @@ -#include "test_data.hpp" - -#include "dl_base.hpp" -#include "dl_constant.hpp" -#include "dl_define.hpp" -#include "dl_nn_relu.hpp" -#include "dl_tool.hpp" -#include "dl_variable.hpp" - -#include "unity.h" -#include - -using namespace dl; -using namespace nn; -using namespace tool; -using namespace base; -using namespace std; - -template -void relu_c(Tensor &output, Tensor &input) -{ - int height = input.shape[0]; // inputs and output are the same shape - int width = input.shape[1]; - int channel = input.shape[2]; - - feature_t *input_element = input.get_element_ptr(); - int input_y_offset = input.shape[1] * input.shape[2]; - int input_x_offset = input.shape[2]; - - feature_t *output_element = output.get_element_ptr(); // output - int output_y_offset = output.shape[1] * output.shape[2]; - int output_x_offset = output.shape[2]; - - for (size_t output_y = 0; output_y < height; output_y++) { - feature_t *input_11c = input_element; - feature_t *output_11c = output_element; - - for (size_t output_x = 0; output_x < width; output_x++) { - for (size_t output_c = 0; output_c < channel; output_c++) { - output_11c[output_c] = DL_MAX(0, input_11c[output_c]); - } - input_11c += input_x_offset; - output_11c += output_x_offset; - } - - input_element += input_y_offset; - output_element += output_y_offset; - } -} - -template -Tensor relu_c(Tensor &input) -{ - Tensor output; - output.set_exponent(input.exponent).set_shape(input.shape).malloc_element(); - relu_c(output, input); - return output; -} - -bool test_relu_s8(int exponent, int offset0, int height, int width, int channel, bool inplace) -{ - if (!inplace) { - Tensor input0; - input0.set_element((int8_t *)&input0_element[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - // output; - Tensor output_c = relu_c(input0); - latency.start(); - Tensor output = relu(input0); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else { - Tensor input0_tmp; - input0_tmp.set_element((int8_t *)&input0_element[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - Tensor input0(input0_tmp, true); - input0.set_auto_free(true); - - // output; - Tensor output_c = relu_c(input0); - latency.start(); - relu(input0); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } -} - -bool test_relu_s16(int exponent, int offset0, int height, int width, int channel, bool inplace) -{ - if (!inplace) { - Tensor input0; - input0.set_element((int16_t *)&input0_element_s16[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - // output; - Tensor output_c = relu_c(input0); - latency.start(); - Tensor output = relu(input0); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else { - Tensor input0_tmp; - input0_tmp.set_element((int16_t *)&input0_element_s16[offset0]) - .set_exponent(exponent) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - Tensor input0(input0_tmp, true); - input0.set_auto_free(true); - - // output; - Tensor output_c = relu_c(input0); - latency.start(); - relu(input0); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } -} - -//---------------------------------------------------------------no-inplace------------------------------------------------------------------------------------ -TEST_CASE("test no inplace start", "[relu]") -{ - TEST_ASSERT(true); - printf("\n\n"); -} - -TEST_CASE("test_relu_s8, c=6", "[relu]") -{ - TEST_ASSERT(test_relu_s8(-8, 0, 5, 7, 6, false)); -} - -TEST_CASE("test_relu_s8, c=16", "[relu]") -{ - TEST_ASSERT(test_relu_s8(-8, 0, 5, 7, 16, false)); -} - -TEST_CASE("test_relu_s8, c=16", "[relu]") -{ - TEST_ASSERT(test_relu_s8(-8, 5, 5, 7, 16, false)); -} - -TEST_CASE("test_relu_s8, c=35", "[relu]") -{ - TEST_ASSERT(test_relu_s8(-8, 0, 5, 7, 35, false)); -} - -TEST_CASE("test_relu_s16, c=6", "[relu]") -{ - TEST_ASSERT(test_relu_s16(-8, 0, 5, 7, 6, false)); -} - -TEST_CASE("test_relu_s16, c=16", "[relu]") -{ - TEST_ASSERT(test_relu_s16(-8, 0, 5, 7, 16, false)); -} - -TEST_CASE("test_relu_s16, c=16", "[relu]") -{ - TEST_ASSERT(test_relu_s16(-8, 5, 5, 7, 16, false)); -} - -TEST_CASE("test_relu_s16, c=35", "[relu]") -{ - TEST_ASSERT(test_relu_s16(-8, 0, 5, 7, 35, false)); -} - -//---------------------------------------------------------------inplace------------------------------------------------------------------------------------ -TEST_CASE("test inplace start", "[relu]") -{ - TEST_ASSERT(true); - printf("\n\n"); -} - -TEST_CASE("test_relu_s8, c=6", "[relu]") -{ - TEST_ASSERT(test_relu_s8(-8, 0, 5, 7, 6, true)); -} - -TEST_CASE("test_relu_s8, c=16", "[relu]") -{ - TEST_ASSERT(test_relu_s8(-8, 0, 5, 7, 16, true)); -} - -TEST_CASE("test_relu_s8, c=16", "[relu]") -{ - TEST_ASSERT(test_relu_s8(-8, 5, 5, 7, 16, true)); -} - -TEST_CASE("test_relu_s8, c=35", "[relu]") -{ - TEST_ASSERT(test_relu_s8(-8, 0, 5, 7, 35, true)); -} - -TEST_CASE("test_relu_s16, c=6", "[relu]") -{ - TEST_ASSERT(test_relu_s16(-8, 0, 5, 7, 6, true)); -} - -TEST_CASE("test_relu_s16, c=16", "[relu]") -{ - TEST_ASSERT(test_relu_s16(-8, 0, 5, 7, 16, true)); -} - -TEST_CASE("test_relu_s16, c=16", "[relu]") -{ - TEST_ASSERT(test_relu_s16(-8, 5, 5, 7, 16, true)); -} - -TEST_CASE("test_relu_s16, c=35", "[relu]") -{ - TEST_ASSERT(test_relu_s16(-8, 0, 5, 7, 35, true)); -} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_result.png b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_result.png deleted file mode 100644 index 33d2aca7..00000000 Binary files a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_result.png and /dev/null differ diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_sub.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_sub.cpp deleted file mode 100644 index db20a5c7..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_sub.cpp +++ /dev/null @@ -1,937 +0,0 @@ -#include "test_data.hpp" - -#include "dl_base.hpp" -#include "dl_constant.hpp" -#include "dl_define.hpp" -#include "dl_nn_sub2d.hpp" -#include "dl_tool.hpp" -#include "dl_variable.hpp" - -#include "unity.h" -#include - -using namespace dl; -using namespace nn; -using namespace tool; -using namespace base; -using namespace std; - -template -void sub2d_c(Tensor &output, - Tensor &input0, - Tensor &input1, - const Activation *const activation) -{ - int height = input0.shape[0]; // inputs and output are the same shape - int width = input0.shape[1]; - int channel = input0.shape[2]; - - feature_t *input0_element = input0.get_element_ptr(); - int input0_y_offset = input0.shape[1] * input0.shape[2]; - int input0_x_offset = input0.shape[2]; - - feature_t *input1_element = input1.get_element_ptr(); - int input1_y_offset = input1.shape[1] * input1.shape[2]; - int input1_x_offset = input1.shape[2]; - - feature_t *output_element = output.get_element_ptr(); // output - int output_y_offset = output.shape[1] * output.shape[2]; - int output_x_offset = output.shape[2]; - - int buffer = 0; - int max_input_exponent = DL_MAX(input0.exponent, input1.exponent); - int input0_shift = max_input_exponent - input0.exponent; - int input1_shift = max_input_exponent - input1.exponent; - int output_scale = 1; - int output_shift = output.exponent - max_input_exponent; - if (output_shift < 0) { - output_scale = 1 << (-output_shift); - output_shift = 0; - } - - activation_type_t activation_type = activation ? activation->type : Linear; - feature_t activation_alpha; - int activation_shift; - const feature_t *activation_alpha_ptr; - - switch (activation_type) { - case ReLU: - activation_alpha = 0; - activation_shift = 0; - activation_alpha_ptr = NULL; - break; - case LeakyReLU: - activation_alpha = activation->element[0]; - activation_shift = -activation->exponent; - activation_alpha_ptr = NULL; - break; - case PReLU: - activation_alpha = 0; - activation_alpha_ptr = activation->element; - activation_shift = -activation->exponent; - break; - default: - activation_alpha = 0; - activation_alpha_ptr = NULL; - activation_shift = -1; - break; - } - - for (size_t output_y = 0; output_y < height; output_y++) { - feature_t *input0_11c = input0_element; - feature_t *input1_11c = input1_element; - feature_t *output_11c = output_element; - - for (size_t output_x = 0; output_x < width; output_x++) { - for (size_t output_c = 0; output_c < channel; output_c++) { - buffer = (int)(DL_RIGHT_SHIFT(input0_11c[output_c], input0_shift)) - - (int)(DL_RIGHT_SHIFT(input1_11c[output_c], input1_shift)); - buffer = DL_RIGHT_SHIFT(buffer * output_scale, output_shift); - tool::truncate(output_11c[output_c], buffer); - if (activation_type == ReLU) { - output_11c[output_c] = DL_MAX(0, output_11c[output_c]); - } else if (activation_type == LeakyReLU) { - if (output_11c[output_c] < 0) { - buffer = DL_RIGHT_SHIFT((output_11c[output_c] * activation_alpha), activation_shift); - tool::truncate(output_11c[output_c], buffer); - } - } else if (activation_type == PReLU) { - if (output_11c[output_c] < 0) { - buffer = - DL_RIGHT_SHIFT((output_11c[output_c] * activation_alpha_ptr[output_c]), activation_shift); - tool::truncate(output_11c[output_c], buffer); - } - } - } - input0_11c += input0_x_offset; - input1_11c += input1_x_offset; - output_11c += output_x_offset; - } - - input0_element += input0_y_offset; - input1_element += input1_y_offset; - output_element += output_y_offset; - } -} - -template -Tensor sub2d_c(const int output_exponent, - Tensor &input0, - Tensor &input1, - const Activation *activation, - const std::vector &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -{ - assert(input0.is_same_shape(input1)); - - Tensor output; - output.set_exponent(output_exponent).set_shape(input0.shape).malloc_element(); - sub2d_c(output, input0, input1, activation); - - return output; -} - -bool test_sub_s8(int exponent0, - int exponent1, - int exponent_out, - int offset0, - int offset1, - int height, - int width, - int channel, - int activation_type, - bool inplace) -{ - if (!inplace) { - Tensor input0; - input0.set_element((int8_t *)&input0_element[offset0]) - .set_exponent(exponent0) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1; - input1.set_element((int8_t *)&input1_element[offset1]) - .set_exponent(exponent1) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - // output; - if (activation_type == 0) { - Tensor output_c = sub2d_c(exponent_out, input0, input1, (Activation *)NULL); - latency.start(); - Tensor output = sub2d( - exponent_out, input0, input1, (Activation *)NULL); //(Activation *)NULL - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 1) { // relu - Tensor output_c = sub2d_c(exponent_out, input0, input1, &layer_activation_relu); - latency.start(); - Tensor output = sub2d(exponent_out, input0, input1, &layer_activation_relu); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 2) { // leakyrelu - Tensor output_c = sub2d_c(exponent_out, input0, input1, &layer_activation_lrelu); - latency.start(); - Tensor output = sub2d(exponent_out, input0, input1, &layer_activation_lrelu); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 3) { // prelu - Tensor output_c = sub2d_c(exponent_out, input0, input1, &layer_activation_prelu); - latency.start(); - Tensor output = sub2d(exponent_out, input0, input1, &layer_activation_prelu); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } - return false; - } else { - Tensor input0_tmp; - input0_tmp.set_element((int8_t *)&input0_element[offset0]) - .set_exponent(exponent0) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1_tmp; - input1_tmp.set_element((int8_t *)&input1_element[offset1]) - .set_exponent(exponent1) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - Tensor input0; - input0 = input0_tmp; - input0.set_auto_free(true); - - Tensor input1; - input1 = input1_tmp; - input1.set_auto_free(true); - - // output; - if (activation_type == 0) { - Tensor output_c = sub2d_c(exponent_out, input0, input1, (Activation *)NULL); - latency.start(); - sub2d(exponent_out, input0, input1, (Activation *)NULL); //(Activation *)NULL - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 1) { // relu - Tensor output_c = sub2d_c(exponent_out, input0, input1, &layer_activation_relu); - latency.start(); - sub2d(exponent_out, input0, input1, &layer_activation_relu); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 2) { // leakyrelu - Tensor output_c = sub2d_c(exponent_out, input0, input1, &layer_activation_lrelu); - latency.start(); - sub2d(exponent_out, input0, input1, &layer_activation_lrelu); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 3) { // prelu - Tensor output_c = sub2d_c(exponent_out, input0, input1, &layer_activation_prelu); - latency.start(); - sub2d(exponent_out, input0, input1, &layer_activation_prelu); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } - return false; - } -} - -bool test_sub_s16(int exponent0, - int exponent1, - int exponent_out, - int offset0, - int offset1, - int height, - int width, - int channel, - int activation_type, - bool inplace) -{ - if (!inplace) { - Tensor input0; - input0.set_element((int16_t *)&input0_element_s16[offset0]) - .set_exponent(exponent0) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1; - input1.set_element((int16_t *)&input1_element_s16[offset1]) - .set_exponent(exponent1) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - // output; - if (activation_type == 0) { - Tensor output_c = sub2d_c(exponent_out, input0, input1, (Activation *)NULL); - latency.start(); - Tensor output = sub2d( - exponent_out, input0, input1, (Activation *)NULL); //(Activation *)NULL - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 1) { // relu - Tensor output_c = sub2d_c(exponent_out, input0, input1, &layer_activation_relu_s16); - latency.start(); - Tensor output = sub2d(exponent_out, input0, input1, &layer_activation_relu_s16); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 2) { // leakyrelu - Tensor output_c = sub2d_c(exponent_out, input0, input1, &layer_activation_lrelu_s16); - latency.start(); - Tensor output = sub2d(exponent_out, input0, input1, &layer_activation_lrelu_s16); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 3) { // prelu - Tensor output_c = sub2d_c(exponent_out, input0, input1, &layer_activation_prelu_s16); - latency.start(); - Tensor output = sub2d(exponent_out, input0, input1, &layer_activation_prelu_s16); - latency.end(); - latency.print(); - - return output.check_element(output_c.get_element_ptr(), 2, false); - } - return false; - } else { - Tensor input0_tmp; - input0_tmp.set_element((int16_t *)&input0_element_s16[offset0]) - .set_exponent(exponent0) - .set_shape({height, width, channel}) - .set_auto_free(false); - - Tensor input1_tmp; - input1_tmp.set_element((int16_t *)&input1_element_s16[offset1]) - .set_exponent(exponent1) - .set_shape({height, width, channel}) - .set_auto_free(false); - Latency latency; - - Tensor input0(input0_tmp, true); - input0.set_auto_free(true); - Tensor input1(input1_tmp, true); - input1.set_auto_free(true); - - // output; - if (activation_type == 0) { - Tensor output_c = sub2d_c(exponent_out, input0, input1, (Activation *)NULL); - latency.start(); - sub2d( - exponent_out, input0, input1, (Activation *)NULL); //(Activation *)NULL - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 1) { // relu - Tensor output_c = sub2d_c(exponent_out, input0, input1, &layer_activation_relu_s16); - latency.start(); - sub2d(exponent_out, input0, input1, &layer_activation_relu_s16); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 2) { // leakyrelu - Tensor output_c = sub2d_c(exponent_out, input0, input1, &layer_activation_lrelu_s16); - latency.start(); - sub2d(exponent_out, input0, input1, &layer_activation_lrelu_s16); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } else if (activation_type == 3) { // prelu - Tensor output_c = sub2d_c(exponent_out, input0, input1, &layer_activation_prelu_s16); - latency.start(); - sub2d(exponent_out, input0, input1, &layer_activation_prelu_s16); - latency.end(); - latency.print(); - - return input0.check_element(output_c.get_element_ptr(), 2, false); - } - return false; - } -} - -//------------------------------------------------------------------no -// inplace------------------------------------------------------------------ s8, c = 6, lrelu -TEST_CASE("test no inplace start", "[sub]") -{ - TEST_ASSERT(true); - printf("\n\n"); -} - -TEST_CASE("test_no_scale_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -8, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input0_only_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -8, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input1_only_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -8, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input0_output_shift_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -7, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input1_output_shift_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -7, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input0_output_scale_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -9, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input1_output_scale_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -9, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_output_only_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -10, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_shift_output_only_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -7, 4, 7, 5, 7, 6, 2, false)); -} - -// s8, c = 16, prelu -TEST_CASE("test_no_scale_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -8, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input0_only_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -8, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input1_only_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -8, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input0_output_shift_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -7, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input1_output_shift_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -7, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input0_output_scale_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -9, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input1_output_scale_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -9, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_output_only_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -10, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_shift_output_only_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -7, 0, 0, 5, 7, 16, 3, false)); -} - -// s8, c = 35, linear -TEST_CASE("test_no_scale_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -8, 0, 0, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_scale_input0_only_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -8, 0, 0, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_scale_input1_only_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -8, 0, 0, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_scale_input0_output_shift_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -7, 0, 0, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_scale_input1_output_shift_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -7, 0, 0, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_scale_input0_output_scale_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -9, 0, 0, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_scale_input1_output_scale_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -9, 0, 0, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_scale_output_only_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -10, 0, 0, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_shift_output_only_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -7, 0, 0, 5, 7, 35, 0, false)); -} - -// s16, c = 6, lrelu -TEST_CASE("test_no_scale_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -16, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input0_only_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -16, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input1_only_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -16, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input0_output_shift_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -15, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input1_output_shift_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -15, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input0_output_scale_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -17, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_input1_output_scale_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -17, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_scale_output_only_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -17, 4, 7, 5, 7, 6, 2, false)); -} - -TEST_CASE("test_shift_output_only_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -15, 4, 7, 5, 7, 6, 2, false)); -} - -// s16, c = 16, prelu -TEST_CASE("test_no_scale_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -16, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input0_only_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -16, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input1_only_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -16, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input0_output_shift_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -15, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input1_output_shift_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -15, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input0_output_scale_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -17, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_input1_output_scale_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -17, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_scale_output_only_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -17, 0, 0, 5, 7, 16, 3, false)); -} - -TEST_CASE("test_shift_output_only_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -15, 0, 0, 5, 7, 16, 3, false)); -} - -// s16, c = 35, linear -TEST_CASE("test_no_scale_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -16, 0, 0, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_scale_input0_only_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -16, 0, 0, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_scale_input1_only_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -16, 0, 0, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_scale_input0_output_shift_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -15, 0, 0, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_scale_input1_output_shift_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -15, 0, 0, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_scale_input0_output_scale_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -17, 0, 0, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_scale_input1_output_scale_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -17, 0, 0, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_scale_output_only_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -17, 0, 0, 5, 7, 35, 0, false)); -} - -TEST_CASE("test_shift_output_only_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -15, 0, 0, 5, 7, 35, 0, false)); -} - -//------------------------------------------------------------------inplace------------------------------------------------------------------ -// c = 6, lrelu -TEST_CASE("test inplace start", "[sub]") -{ - TEST_ASSERT(true); - printf("\n\n"); -} - -TEST_CASE("test_no_scale_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -8, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input0_only_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -8, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input1_only_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -8, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input0_output_shift_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -7, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input1_output_shift_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -7, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input0_output_scale_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -9, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input1_output_scale_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -9, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_output_only_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -10, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_shift_output_only_s8, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -7, 4, 7, 5, 7, 6, 2, true)); -} - -// s8, c = 16, prelu -TEST_CASE("test_no_scale_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -8, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input0_only_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -8, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input1_only_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -8, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input0_output_shift_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -7, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input1_output_shift_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -7, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input0_output_scale_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -9, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input1_output_scale_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -9, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_output_only_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -10, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_shift_output_only_s8, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -7, 0, 0, 5, 7, 16, 3, true)); -} - -// s8, c = 35, linear -TEST_CASE("test_no_scale_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -8, 0, 0, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_scale_input0_only_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -8, 0, 0, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_scale_input1_only_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -8, 0, 0, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_scale_input0_output_shift_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -7, 0, 0, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_scale_input1_output_shift_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -7, 0, 0, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_scale_input0_output_scale_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-10, -8, -9, 0, 0, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_scale_input1_output_scale_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -10, -9, 0, 0, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_scale_output_only_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -10, 0, 0, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_shift_output_only_s8, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s8(-8, -8, -7, 0, 0, 5, 7, 35, 0, true)); -} - -// s16, c = 6, lrelu -TEST_CASE("test_no_scale_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -16, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input0_only_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -16, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input1_only_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -16, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input0_output_shift_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -15, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input1_output_shift_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -15, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input0_output_scale_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -17, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_input1_output_scale_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -17, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_scale_output_only_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -17, 4, 7, 5, 7, 6, 2, true)); -} - -TEST_CASE("test_shift_output_only_s16, c=6, lrelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -15, 4, 7, 5, 7, 6, 2, true)); -} - -// s16, c = 16, prelu -TEST_CASE("test_no_scale_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -16, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input0_only_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -16, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input1_only_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -16, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input0_output_shift_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -15, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input1_output_shift_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -15, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input0_output_scale_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -17, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_input1_output_scale_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -17, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_scale_output_only_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -17, 0, 0, 5, 7, 16, 3, true)); -} - -TEST_CASE("test_shift_output_only_s16, c=16, prelu", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -15, 0, 0, 5, 7, 16, 3, true)); -} - -// s16, c = 35, linear -TEST_CASE("test_no_scale_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -16, 0, 0, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_scale_input0_only_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -16, 0, 0, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_scale_input1_only_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -16, 0, 0, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_scale_input0_output_shift_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -15, 0, 0, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_scale_input1_output_shift_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -15, 0, 0, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_scale_input0_output_scale_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-17, -16, -17, 0, 0, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_scale_input1_output_scale_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -17, -17, 0, 0, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_scale_output_only_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -17, 0, 0, 5, 7, 35, 0, true)); -} - -TEST_CASE("test_shift_output_only_s16, c=35, linear", "[sub]") -{ - TEST_ASSERT(test_sub_s16(-16, -16, -15, 0, 0, 5, 7, 35, 0, true)); -} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_tensor.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_tensor.cpp deleted file mode 100644 index cb893de6..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_tensor.cpp +++ /dev/null @@ -1,225 +0,0 @@ -#include "unity.h" -#include -#include - -#include "esp_system.h" -#include "freertos/FreeRTOS.h" -#include "freertos/task.h" - -#include "dl_variable.hpp" - -using namespace dl; -using namespace std; - -TEST_CASE("Tensor", "shape") -{ - Tensor b; - b.set_shape({2, 3, 4}).malloc_element(); - for (int i = 0; i < b.get_size(); i++) { - b.element[i] = i; - } - - Tensor c; - c.set_shape({2, 3, 4}).malloc_element(); - for (int i = 0; i < b.get_size(); i++) { - c.element[i] = i + 100; - } - - Tensor x; - x.set_shape({2, 3, 4, 3, 2}).malloc_element(); - for (int i = 0; i < x.get_size(); i++) { - x.element[i] = i; - } - - Tensor a(b, true); - a.flatten(); - printf("\n---------------------------------------------------data--------------------------------------------------" - "-\n"); - a.print({}, "\na orig"); - b.print({}, "\nb orig"); - c.print({}, "\nc orig"); - x.print({}, "\nc orig"); - - printf("\n---------------------------------------------------slice-------------------------------------------------" - "--\n"); - Tensor d = b.slice({0, -1, 1, 3, -1, 4}); - d.print({}, "\nslice 1"); - - d = b.slice({0, 2, 0, 3, 0, 4}); - d.print({}, "\nslice 2"); - - d = b.slice({0, 3, 2, 100, 0, 100}); - d.print({}, "\nslice 3"); - - d = a.slice({-1, 100}); - d.print({}, "\nslice 4"); - d = a.slice({1, 20}); - d.print({}, "\nslice 5"); - - a.expand_dims(1); - d = a.slice({1, 20, 0, 1}); - d.print({}, "\nslice 6"); - a.expand_dims(0); - d = a.slice({0, 1, 1, 20, 0, 1}); - d.print({}, "\nslice 7"); - - a.squeeze(); - printf("\n---------------------------------------------------set value (T) " - "---------------------------------------------------\n"); - d = b; - d.set_value(-6); - d.print({}, "\n set_value T 1"); - d.set_value(200); - d.print({}, "\n set_value T 2"); - - printf("\n---------------------------------------------------set value " - "(Tensor)---------------------------------------------------\n"); - Tensor temp; - - d.set_value(b); - d.print({}, "\n set_value Tensor 1"); - - temp = b.slice({0, 1, 0, 3, 0, 4}); - temp.print({}, "\n temp"); - d.set_value(temp); - d.print({}, "set_value Tensor 2"); - - temp = b.slice({0, 100, 0, 1, 0, 100}); - temp.print({}, "\n temp"); - d.set_value(temp); - d.print({}, "\n set_value Tensor 3"); - - temp = b.slice({0, 100, 0, 100, 0, 1}); - temp.print({}, "\n temp"); - d.set_value(temp); - d.print({}, "set_value Tensor 4"); - - d.flatten().set_value(a); - d.print({}, "\nset_value Tensor 5"); - - temp = a.slice({2, 3}); - d.set_value(temp); - d.print({}, "\nset_value Tensor 6"); - - temp = a.slice({2, 3}).expand_dims({0, 2}); - d.expand_dims({0, 2}).set_value(temp); - d.print({}, "\nset_value Tensor 7"); - - printf("\n---------------------------------------------------set value (slice, " - "T)---------------------------------------------------\n"); - d.reshape({2, 3, -1}).set_value(b); - d.set_value({0, 2, 1, 3, 0, 3}, 100); - d.print({}, "\nset_value slice T 1"); - - d.set_value(b); - d.set_value({-1, 2, 0, 1, 3, 4}, 100); - d.print({}, "\nset_value slice T 2"); - - d.set_value(b); - d.set_value({0, 100, 0, 100, 0, 100}, 100); - d.print({}, "\nset_value slice T 3"); - - d.flatten().set_value(a); - d.set_value({0, -1}, -1); - d.print({}, "\nset_value slice T 4"); - - d.set_value({3, 99}, 66); - d.print({}, "\nset_value slice T 5"); - - printf("\n---------------------------------------------------set value (slice, " - "Tensor)---------------------------------------------------\n"); - - d.reshape({2, 3, -1}); - temp = c.slice({0, 2, 1, 3, 0, -1}); - temp.print({}, "\n temp"); - d.set_value(b).set_value({0, 2, 1, 3, 0, 3}, temp); - d.print({}, "set_value slice Tensor 1"); - - temp = c.slice({0, 2, 1, 2, 0, 1}); - temp.print({}, "\n temp"); - d.set_value(b).set_value({0, 2, 1, 2, 0, 1}, temp); - d.print({}, "set_value slice Tensor 2"); - - temp = c.slice({0, 2, 1, 2, 0, 5}); - temp.print({}, "\n temp"); - d.set_value(b).set_value({0, 2, 1, 2, 0, 5}, temp); - d.print({}, "set_value slice Tensor 3"); - - temp = c.slice({0, 1, 1, 5, 0, 1}); - temp.print({}, "\n temp"); - d.set_value(b).set_value({0, 1, 1, 5, 0, 1}, temp); - d.print({}, "set_value slice Tensor 4"); - - temp = c.slice({0, 2, 1, 3, 0, 1}); - temp.print({}, "\n temp"); - d.set_value(b).set_value({0, 2, 1, 3, 0, 3}, temp); - d.print({}, "set_value slice Tensor 5"); - - temp = c.slice({0, 2, 1, 2, 0, 1}); - temp.print({}, "\n temp"); - d.set_value(b).set_value({0, 2, 1, 2, 0, 100}, temp); - d.print({}, "set_value slice Tensor 6"); - - temp = c.slice({0, 2, 1, 2, 0, 5}); - temp.print({}, "\n temp"); - d.set_value(b).set_value({0, 2, 1, 100, 0, 5}, temp); - d.print({}, "set_value slice Tensor 7"); - - temp = c.slice({0, 1, 1, 5, 0, 1}); - temp.print({}, "\n temp"); - d.set_value(b).set_value({0, 100, 1, 5, 0, 100}, temp); - d.print({}, "set_value slice Tensor 8"); - - d = x; - temp = c.slice({0, 2, 1, 2, 0, 2}).expand_dims({0, 4}); - temp.print({}, "\n temp"); - d.set_value({0, 100, 0, 2, 1, 100, 0, 2, 0, 100}, temp); - d.print({}, "set_value slice Tensor 9"); - - printf("\n---------------------------------------------------reverse-----------------------------------------------" - "----\n"); - - d = b; - d.reverse({0}); - d.print({}, "\nreverse {0}"); - - d = b; - d.reverse({1}); - d.print({}, "\nreverse {1}"); - - d = b; - d.reverse({2}); - d.print({}, "\nreverse {2}"); - - d = b; - d.reverse({-1}); - d.print({}, "\nreverse {-1}"); - - d = b; - d.reverse({-2}); - d.print({}, "\nreverse {-2}"); - - d = b; - d.reverse({1, 0}); - d.print({}, "\nreverse {1, 0}"); - - d = b; - d.reverse({1, 2}); - d.print({}, "\nreverse {1, 2}"); - - d = b; - d.reverse({2, 0}); - d.print({}, "\nreverse {2, 0}"); - - d = b; - d.reverse({2, 0, 1}); - d.print({}, "\nreverse {2, 0, 1}"); - - d = b; - d.reverse({}); - d.print({}, "\nreverse {}"); - - d = a; - d.reverse({-1}); - d.print({}, "\nreverse {-1}"); -} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_tool.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_tool.hpp deleted file mode 100644 index 00f18e24..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/test_tool.hpp +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once - -#include -#include - -inline void random_value(int8_t &value, int32_t low = INT8_MIN, int32_t high = INT8_MAX) -{ - value = (rand() % (high - low + 1)) + low; -} - -inline void random_array(int8_t *array, const int length) -{ - for (size_t i = 0; i < length; i++) random_value(array[i]); -} - -inline void random_value(int16_t &value, int32_t low = INT16_MIN, int32_t high = INT16_MAX) -{ - value = (rand() % (high - low + 1)) + low; -} - -inline void random_array(int16_t *array, const int length) -{ - for (size_t i = 0; i < length; i++) random_value(array[i]); -} diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/testcase.py b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/testcase.py deleted file mode 100644 index 0d892926..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/test/testcase.py +++ /dev/null @@ -1,605 +0,0 @@ -import os -import time -from pathlib import Path -import tensorflow as tf -import numpy -import sys - -sys.path.append(os.getcwd() + "/../..") -from convert.utils import Export, quantize, CHIP_LIST, ACTIVATION_LIST - - -def get_sign_width(t: str): - assert t in ("u8", "s8", "s16") - return "u" if t[0] == "u" else "", eval(t[1:]) - - -def get_range(s: str, w: int): - if s == "u": - return 0, 2**w - 1 - else: - return -(2 ** (w - 1)), 2 ** (w - 1) - 1 - - -class TestDL: - def __init__( - self, target_chip: str, step: int = 25, total: int = 100, quant_method: int = 0 - ): - assert target_chip in CHIP_LIST - - self.target_chip = target_chip - self.root = "./" - self.step = step - self.total = total - self.case = [] - self.quant_method = quant_method - - with open(f"{self.root}/CMakeLists.txt", "r") as file: - self.backup = file.read() - return - - def delete_case(self): - while len(self.case) > 0: - case = self.case.pop() - while not Path(f"{self.root}/{case}.cpp").exists(): - print(".", end="") - time.sleep(0.5) - os.remove(f"{self.root}/{case}.cpp") - return - - def wait(self) -> bool: - with open(f"{self.root}/CMakeLists.txt", "w") as file: - all_cpp = ".cpp\n ".join(self.case) - file.writelines( - ( - f"set(srcs \n {all_cpp}.cpp\n)\n", - "set(requires unity dl)\n\n", - "idf_component_register(SRCS ${srcs} REQUIRES ${requires})\n\n", - ) - ) - - input_str = "" - while not input_str.isnumeric() and not input_str == "exit": - input_str = input( - f'>>> {len(self.case)} TestCases were generated\nInput a number to continue or "exit" to exit: ' - ) - - self.delete_case() - - if input_str.isnumeric() and input_str != "0": - self.step = eval(input_str) - else: - return False - - return True - - def testcase( - self, - name: str, - feature_type: str, - input_shape: tuple, - filter_shape: tuple, - stride: tuple, - dilation: tuple, - operation: str, - activation_types: tuple, - ): - """ - - :param name: - :param feature_type: - :param filter_shape: - :param stride: (1, stride_y, stride_x, 1) - :param dilation: (dilation_y, dilation_x) - :param operation: - :return: - """ - test_unaligned = (True, False)[ - numpy.random.randint(0, 2) - ] # actually must choose one - - print("input_shape: %s, filter_shape: %s" % (input_shape, filter_shape)) - output_channel = filter_shape[3] if operation == "conv2d" else input_shape[-1] - padding = ("VALID", "SAME")[numpy.random.randint(0, 2)] - - if operation in ("conv2d", "depthwise_conv2d"): - with_bias = (True, False)[numpy.random.randint(0, 2)] - else: - with_bias = False - - feature_sign, feature_element_width = get_sign_width(feature_type) - feature_low, feature_high = get_range(feature_sign, feature_element_width) - bias_low, bias_high = get_range(feature_sign, feature_element_width // 2) - - activation_type = activation_types[ - numpy.random.randint(0, len(activation_types)) - ] - - if ( - operation not in ("conv2d", "depthwise_conv2d") - or feature_element_width == 16 - ): - self.quant_method = 0 - if self.quant_method == 1 and feature_element_width == 8: - bias_low, bias_high = get_range(feature_sign, 16 // 2) - - # input - input_exponent = -feature_element_width - input_q = numpy.random.randint(feature_low, feature_high, input_shape).astype( - float - ) - input_f = input_q * 2**input_exponent - - # filter - if self.quant_method == 0: # per-layer - filter_exponent = -feature_element_width - filter_q = numpy.random.randint( - feature_low, feature_high, filter_shape - ).astype(float) - filter_f = filter_q * 2**filter_exponent - elif self.quant_method == 1: # int8 per-channel - filter_exponent = numpy.random.randint(-7, -4, output_channel).astype(float) - filter_q = numpy.random.randint( - feature_low, feature_high, filter_shape - ).astype(float) - filter_f = filter_q * 2**filter_exponent - - # output - output_f = None - if operation == "conv2d": - output_f = tf.nn.conv2d( - input_f, filter_f, stride, padding, dilations=dilation - ) - elif operation == "depthwise_conv2d": - output_f = tf.nn.depthwise_conv2d( - input_f, filter_f, stride, padding, dilations=dilation - ) - elif operation == "avg_pool2d": - output_f = tf.nn.avg_pool2d(input_f, filter_shape, stride, padding) - elif operation == "global_avg_pool2d": - output_f = tf.nn.avg_pool2d(input_f, filter_shape, 1, "VALID") - - if output_f is not None: - _, output_exponent, _ = quantize(operation, output_f, feature_element_width) - else: - output_exponent = input_exponent - - if operation in ("conv2d", "depthwise_conv2d"): - mac_shift = input_exponent + filter_exponent - output_exponent - else: - mac_shift = input_exponent - output_exponent - - if operation == "conv2d": - output_q = tf.nn.conv2d( - input_q, filter_q, stride, padding, dilations=dilation - ) - elif operation == "depthwise_conv2d": - output_q = tf.nn.depthwise_conv2d( - input_q, filter_q, stride, padding, dilations=dilation - ) - elif operation == "max_pool2d": - output_q = tf.nn.max_pool2d(input_q, filter_shape, stride, padding) - elif operation == "avg_pool2d": - output_q = tf.nn.avg_pool2d(input_q, filter_shape, stride, padding) - elif operation == "global_max_pool2d": - output_q = tf.nn.max_pool2d(input_q, filter_shape, 1, "VALID") - elif operation == "global_avg_pool2d": - output_q = tf.nn.avg_pool2d(input_q, filter_shape, 1, "VALID") - - if self.quant_method == 1 and feature_element_width == 8 and with_bias: - output_q = output_q.numpy().astype(int) - output_q = output_q >> 4 - # output_q = numpy.clip(output_q, -2 ** 15, 2 ** 15 - 1).astype(int) - - bias_exponent = input_exponent + filter_exponent + 4 - bias_q = numpy.random.randint(bias_low, bias_high, output_channel).astype( - int - ) - output_q += bias_q - output_q = numpy.floor(output_q * (2 ** (mac_shift + 4))) - output_q = numpy.clip(output_q, feature_low, feature_high).astype(int) - else: - output_q = output_q.numpy() - output_q = output_q * (2**mac_shift) - output_q = numpy.clip(output_q, feature_low, feature_high) - - # bias - bias_q = None - bias_exponent = output_exponent - if with_bias and "conv" in operation: - bias_q = numpy.random.randint( - bias_low, bias_high, output_channel - ).astype(float) - output_q += bias_q - output_q = numpy.clip(output_q, feature_low, feature_high) - - # activation - alpha_q = None - alpha_e = None - if "conv" in operation: - if activation_type == "ReLU": - output_q = numpy.where(output_q < 0, 0, output_q) - elif activation_type == "LeakyReLU": - alpha_q = numpy.random.randint(feature_low, feature_high, 1) - alpha_e = -feature_element_width - output_q = numpy.where( - output_q < 0, output_q * alpha_q[0] * (2**alpha_e), output_q - ) - alpha_q = alpha_q.astype(int) - elif activation_type == "PReLU": - alpha_q = numpy.random.randint( - feature_low, feature_high, output_channel - ) - alpha_e = -feature_element_width - for i in range(output_channel): - output_q[..., i] = numpy.where( - output_q[..., i] < 0, - output_q[..., i] * alpha_q[i] * 2**alpha_e, - output_q[..., i], - ) - alpha_q = alpha_q.astype(int) - else: # 'Linear' - pass - - output_q = numpy.clip(output_q, feature_low, feature_high) - - # export to c - padding_dl = "PADDING_VALID" if padding == "VALID" else "PADDING_SAME_END" - - if test_unaligned: - aligned = False - print_addr = ' printf("input_element: %p\\n", &input_element);\n' - else: - aligned = True - print_addr = "" - - with open(f"{self.root}/{name}.cpp", "w") as file: - file.writelines( - ( - "#include \n", - "\n", - '#include "dl_constant.hpp"\n', - '#include "dl_variable.hpp"\n', - '#include "dl_tool.hpp"\n', - '#include "dl_define.hpp"\n', - f'#include "dl_nn_{operation.lower()}.hpp"\n\n', - "#include \n", - '#include "unity.h"\n\n', - "using namespace dl;\n", - "using namespace nn;\n", - "using namespace tool;\n\n", - ) - ) - tool = Export(target_chip=self.target_chip, source_file=file, indent="") - - tool.export_element( - name="input", - array=input_q.astype(int), - array_type=feature_type, - aligned=aligned, - ) - tool.export_element( - name="output", - array=output_q.astype(int), - array_type=feature_type, - aligned=aligned, - ) - - if "conv2d" in operation: - tool( - name="layer", - operation=operation, - feature_type=feature_type, - filter_element=filter_q.astype(int), - filter_exponent=filter_exponent, - stride=stride, - dilation=dilation, - bias_element=None if bias_q is None else bias_q.astype(int), - bias_exponent=bias_exponent, - activation_type=activation_type, - activation_element=None if alpha_q is None else alpha_q.astype(int), - activation_exponent=alpha_e, - quant_granularity=self.quant_method, - ) - - if self.quant_method == 1 and feature_element_width == 8: - bias_str = "&layer_bias" if with_bias else f"(Bias *)NULL" - else: - bias_str = ( - "&layer_bias" - if with_bias - else f"(Bias *)NULL" - ) - activation_str = ( - "&layer_activation" - if activation_type - else f"(Activation *)NULL" - ) - input_shape_str = ( - input_shape[1:].__str__().replace("(", "{").replace(")", "}") - ) - filter_shape_str = ( - filter_shape.__str__().replace("(", "{").replace(")", "}") - ) - if operation.count("conv2d") > 0: - file.writelines( - ( - f'\nTEST_CASE("{name}", "[{operation}]")\n', - "{\n", - f" Tensor<{feature_sign}int{feature_element_width}_t> input;\n", - f" input.set_element(({feature_sign}int{feature_element_width}_t *)input_element).set_exponent({input_exponent}).set_shape({input_shape_str}).set_auto_free(false);\n\n", - f"{print_addr}", - f" Tensor<{feature_sign}int{feature_element_width}_t> output = {operation}({output_exponent}, input, layer_filter, {stride[1]}, {stride[2]}, {padding_dl}, {bias_str}, {activation_str});\n", - f" TEST_ASSERT(output.check_element(({feature_sign}int{feature_element_width}_t *)output_element, 2, false));\n", - "}\n", - ) - ) - elif operation.count("global") > 0: - file.writelines( - ( - f'\nTEST_CASE("{name}", "[{operation}]")\n', - "{\n", - f" Tensor<{feature_sign}int{feature_element_width}_t> input;\n", - f" input.set_element(({feature_sign}int{feature_element_width}_t *)input_element).set_exponent({input_exponent}).set_shape({input_shape_str}).set_auto_free(false);\n\n", - f"{print_addr}", - ( - f" Tensor<{feature_sign}int{feature_element_width}_t> output = {operation}(input);\n" - if operation.count("max") > 0 - else f" Tensor<{feature_sign}int{feature_element_width}_t> output = {operation}({output_exponent}, input);\n" - ), - f" TEST_ASSERT(output.check_element(({feature_sign}int{feature_element_width}_t *)output_element, 5, false));\n", - "}\n", - ) - ) - elif operation.count("pool2d") > 0: - file.writelines( - ( - f'\nTEST_CASE("{name}", "[{operation}]")\n', - "{\n", - f" Tensor<{feature_sign}int{feature_element_width}_t> input;\n", - f" input.set_element(({feature_sign}int{feature_element_width}_t *)input_element).set_exponent({input_exponent}).set_shape({input_shape_str}).set_auto_free(false);\n\n", - f"{print_addr}", - ( - f" Tensor<{feature_sign}int{feature_element_width}_t> output = {operation}(input, {filter_shape_str}, {stride[1]}, {stride[2]}, {padding_dl});\n" - if operation.count("max") > 0 - else f" Tensor<{feature_sign}int{feature_element_width}_t> output = {operation}({output_exponent}, input, {filter_shape_str}, {stride[1]}, {stride[2]}, {padding_dl});\n" - ), - f" TEST_ASSERT(output.check_element(({feature_sign}int{feature_element_width}_t *)output_element, 8, false));\n", - "}\n", - ) - ) - - else: - raise ValueError(f"operation {operation} is not supported.") - return - - def __call__( - self, - operation: str, - feature_type: str, - input_shape: tuple = None, - filter_shape: tuple = None, - stride: tuple = None, - dilation: tuple = None, - activation: tuple = None, - ): - assert operation in ( - "conv2d", - "depthwise_conv2d", - "max_pool2d", - "avg_pool2d", - "global_max_pool2d", - "global_avg_pool2d", - ) - assert feature_type in ("s16", "s8") - - is_continue = True - i = 0 - for _ in range(self.total): - i += 1 - - if operation == "conv2d": - if filter_shape is None: - _filter_shape = tuple(numpy.random.randint(1, 8, 2)) + tuple( - numpy.random.randint(1, 48, 2) - ) - elif len(filter_shape) == 2: - _filter_shape = filter_shape + tuple(numpy.random.randint(1, 48, 2)) - elif len(filter_shape) == 4: - _filter_shape = filter_shape - else: - raise ValueError(f"filter_shape can not be {filter_shape}.") - - if dilation is None: - _dilation = tuple(numpy.random.randint(1, 5, 2)) - else: - _dilation = dilation - - if stride is None: - _stride = (1,) + tuple(numpy.random.randint(1, 5, 2)) + (1,) - else: - _stride = stride - - elif operation == "depthwise_conv2d": - if filter_shape is None: - _filter_shape = tuple(numpy.random.randint(1, 8, 2)) + ( - numpy.random.randint(1, 256), - 1, - ) - elif len(filter_shape) == 2: - _filter_shape = ( - filter_shape + tuple(numpy.random.randint(1, 256, 1)) + (1,) - ) - elif len(filter_shape) == 4: - _filter_shape = filter_shape - else: - raise ValueError(f"filter_shape can not be {filter_shape}.") - - if dilation is None: - _dilation = tuple(numpy.random.randint(1, 5, 2)) - else: - _dilation = dilation - - if _dilation[0] == _dilation[1] == 1: - if stride is None: - s = numpy.random.randint(1, 5) - _stride = (1, s, s, 1) - else: - _stride = stride - else: - if stride is None: - _stride = (1, 1, 1, 1) - elif stride[1] == stride[2] == 1: - _stride = stride - else: - raise ValueError( - "When dilation_x != 1 or dilation_y !=1, TF only support stride = (1, 1, 1, 1)" - ) - - elif operation in ("max_pool2d", "avg_pool2d"): - if filter_shape is None: - _filter_shape = tuple(numpy.random.randint(1, 8, 2)) - elif len(filter_shape) == 2: - _filter_shape = filter_shape - else: - raise ValueError(f"filter_shape can not be {filter_shape}.") - - _dilation = (1, 1) - if stride is None: - s = numpy.random.randint(1, 5) - _stride = (1, s, s, 1) - else: - _stride = stride - - if operation in ("global_max_pool2d", "global_avg_pool2d"): - _stride = 1 - _dilation = (1, 1) - if input_shape is None: - if filter_shape is None: - _filter_shape = tuple(numpy.random.randint(1, 8, 2)) - elif len(filter_shape) == 2: - _filter_shape = filter_shape - else: - raise ValueError(f"filter_shape can not be {filter_shape}.") - - input_height = _filter_shape[0] - input_width = _filter_shape[1] - input_channel = numpy.random.randint(1, 256) - _input_shape = (1, input_height, input_width, input_channel) - else: - if filter_shape is None: - _filter_shape = tuple(numpy.random.randint(1, 8, 2)) - _filter_shape[0] = input_shape[0] - _filter_shape[1] = input_shape[1] - elif len(filter_shape) == 2: - assert _filter_shape[0] == input_shape[0] - assert _filter_shape[1] == input_shape[1] - else: - raise ValueError(f"filter_shape can not be {filter_shape}.") - - input_channel = numpy.random.randint(1, 256) - _input_shape = (1, input_height, input_width, input_channel) - elif input_shape is None: - input_shape_min = max( - (_filter_shape[0] - 1) * _dilation[0] + 1, - (_filter_shape[1] - 1) * _dilation[1] + 1, - ) - input_shape_max = ( - max( - ((_filter_shape[0] - 1) * _dilation[0] + 1) * _stride[1], - ((_filter_shape[1] - 1) * _dilation[1] + 1) * _stride[2], - ) - * 2 - ) - input_height = numpy.random.randint(input_shape_min, input_shape_max) - input_width = numpy.random.randint(input_shape_min, input_shape_max) - if operation in ("conv2d", "depthwise_conv2d"): - _input_shape = (1, input_height, input_width, _filter_shape[2]) - else: - input_channel = numpy.random.randint(1, 256) - _input_shape = (1, input_height, input_width, input_channel) - else: - if operation in ("conv2d", "depthwise_conv2d"): - _input_shape = (1,) + input_shape + (_filter_shape[2],) - else: - input_channel = numpy.random.randint(1, 256) - _input_shape = (1,) + input_shape + (input_channel,) - - self.testcase( - name=f"test_{i}", - feature_type=feature_type, - input_shape=_input_shape, - filter_shape=_filter_shape, - stride=_stride, - dilation=_dilation, - operation=operation, - activation_types=ACTIVATION_LIST if activation is None else activation, - ) - self.case.append(f"test_{i}") - - # CMakeLists.txt - if len(self.case) == self.step: - is_continue = self.wait() - - if not is_continue: - break - - if is_continue and len(self.case): - self.wait() - - with open(f"{self.root}/CMakeLists.txt", "w") as file: - file.write(self.backup) - return - - def __del__(self): - self.delete_case() - return - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="Test Assistant") - parser.add_argument("--target_chip", help=f"{CHIP_LIST}") - parser.add_argument( - "--operation", - help="conv2d, depthwise_conv2d, max_pool2d, avg_pool2d, global_max_pool2d, global_avg_pool2d", - ) - parser.add_argument("--feature_type", help="s16, s8", default="s16") - parser.add_argument("--input_shape", help='"(H, W)"', default="None") - parser.add_argument("--filter_shape", help='"(H, W), (H, W, C, N)"', default="None") - parser.add_argument("--stride", help='"(1, y, x, 1)"', default="None") - parser.add_argument("--dilation", help='"(y, x)"', default="None") - parser.add_argument( - "--activation", - help='"(None, \\"ReLU\\", \\"LeakyReLU\\", \\"PReLU\\")"', - default="None", - ) - parser.add_argument( - "--step", help="Wait for every this number of testcases", default="20" - ) - parser.add_argument("--total", help="The total of testcases", default="100") - parser.add_argument( - "--quant", help="The quantization method of filter", default="0" - ) - args = parser.parse_args() - - if all((args.target_chip, args.operation)): - test_dl = TestDL( - target_chip=args.target_chip, - step=eval(args.step), - total=eval(args.total), - quant_method=eval(args.quant), - ) - test_dl( - operation=args.operation, - feature_type=args.feature_type, - input_shape=eval(args.input_shape), - filter_shape=eval(args.filter_shape), - stride=eval(args.stride), - dilation=eval(args.dilation), - activation=eval(args.activation), - ) - else: - parser.print_help() - quit() diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/include/dl_esp32p4_cache_reg.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/include/dl_esp32p4_cache_reg.hpp deleted file mode 100644 index 9c66a589..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/include/dl_esp32p4_cache_reg.hpp +++ /dev/null @@ -1,24 +0,0 @@ -#include "soc/cache_reg.h" - -/* i 0-core0, 1-core1*/ -#define L1_ICACHE_ACS_HIT_CNT_REG_n(i) (CACHE_L1_IBUS0_ACS_HIT_CNT_REG + 0x10 * i) -#define L1_ICACHE_ACS_MISS_CNT_REG_n(i) (CACHE_L1_IBUS0_ACS_MISS_CNT_REG + 0x10 * i) -#define L1_ICACHE_ACS_CONFLICT_CNT_REG_n(i) (CACHE_L1_IBUS0_ACS_CONFLICT_CNT_REG + 0x10 * i) -#define L1_ICACHE_ACS_NXTLVL_CNT_REG_n(i) (CACHE_L1_IBUS0_ACS_NXTLVL_RD_CNT_REG + 0x10 * i) - -#define L1_DCACHE_ACS_HIT_CNT_REG_n(i) (CACHE_L1_DBUS0_ACS_HIT_CNT_REG + 0x14 * i) -#define L1_DCACHE_ACS_MISS_CNT_REG_n(i) (CACHE_L1_DBUS0_ACS_MISS_CNT_REG + 0x14 * i) -#define L1_DCACHE_ACS_CONFLICT_CNT_REG_n(i) (CACHE_L1_DBUS0_ACS_CONFLICT_CNT_REG + 0x14 * i) -#define L1_DCACHE_ACS_NXTLVL_CNT_REG_n(i) (CACHE_L1_DBUS0_ACS_NXTLVL_RD_CNT_REG + 0x14 * i) -#define L1_DCACHE_ACS_NXTLVL_WR_CNT_REG_n(i) (CACHE_L1_DBUS0_ACS_NXTLVL_WR_CNT_REG + 0x14 * i) - -#define L2_ICACHE_ACS_HIT_CNT_REG_n(i) (CACHE_L2_IBUS0_ACS_HIT_CNT_REG + 0x10 * i) -#define L2_ICACHE_ACS_MISS_CNT_REG_n(i) (CACHE_L2_IBUS0_ACS_MISS_CNT_REG + 0x10 * i) -#define L2_ICACHE_ACS_CONFLICT_CNT_REG_n(i) (CACHE_L2_IBUS0_ACS_CONFLICT_CNT_REG + 0x10 * i) -#define L2_ICACHE_ACS_NXTLVL_CNT_REG_n(i) (CACHE_L2_IBUS0_ACS_NXTLVL_RD_CNT_REG + 0x10 * i) - -#define L2_DCACHE_ACS_HIT_CNT_REG_n(i) (CACHE_L2_DBUS0_ACS_HIT_CNT_REG + 0x14 * i) -#define L2_DCACHE_ACS_MISS_CNT_REG_n(i) (CACHE_L2_DBUS0_ACS_MISS_CNT_REG + 0x14 * i) -#define L2_DCACHE_ACS_CONFLICT_CNT_REG_n(i) (CACHE_L2_DBUS0_ACS_CONFLICT_CNT_REG + 0x14 * i) -#define L2_DCACHE_ACS_NXTLVL_CNT_REG_n(i) (CACHE_L2_DBUS0_ACS_NXTLVL_RD_CNT_REG + 0x14 * i) -#define L2_DCACHE_ACS_NXTLVL_WR_CNT_REG_n(i) (CACHE_L2_DBUS0_ACS_NXTLVL_WR_CNT_REG + 0x14 * i) diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/include/dl_tool.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/include/dl_tool.hpp deleted file mode 100644 index 56f93d2e..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/include/dl_tool.hpp +++ /dev/null @@ -1,602 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "esp_cpu.h" -#include "esp_log.h" -#include "esp_system.h" -#include "esp_timer.h" -#include "freertos/FreeRTOS.h" - -#include "dl_define.hpp" -#if CONFIG_ESP32P4_BOOST -#include "dl_esp32p4_cache_reg.hpp" -#endif - -extern "C" { -#if CONFIG_TIE728_BOOST -void dl_tie728_memset_8b(void *ptr, const int value, const int n); -void dl_tie728_memset_16b(void *ptr, const int value, const int n); -void dl_tie728_memset_32b(void *ptr, const int value, const int n); -#endif - -#if CONFIG_ESP32P4_BOOST -typedef enum { - ROUND_MODE_FLOOR = 0, - ROUND_MODE_CEILING = 1, - ROUND_MODE_UP = 2, - ROUND_MODE_DOWN = 3, - ROUND_MODE_HALF_UP = 4, - ROUND_MODE_HALF_DOWN = 5, - ROUND_MODE_HALF_EVEN = 6, - ROUND_MODE_UNNECESSARY = 7, - MODEL_LOCATION_MAX = ROUND_MODE_UNNECESSARY, -} round_mode_t; - -void dl_esp32p4_cfg_round(round_mode_t round_mode); -int dl_esp32p4_round_half_even(float value); -#endif -} - -namespace dl { -namespace tool { - -/** - * @brief Encapsulate the round strategies for different platforms. - * - * @param value The float value. - * @return int - */ -int round(float value); - -/** - * @brief Set memory zero. - * - * @param ptr pointer of memory - * @param n byte number - */ -void set_zero(void *ptr, const int n); - -/** - * @brief Set array value. - * - * @tparam T supports all data type, sizeof(T) equals to 1, 2 and 4 will boost by instruction - * @param ptr pointer of array - * @param value value to set - * @param len length of array - */ -template -void set_value(T *ptr, const T value, const int len) -{ -#if CONFIG_TIE728_BOOST - int *temp = (int *)&value; - if (sizeof(T) == 1) - dl_tie728_memset_8b(ptr, *temp, len); - else if (sizeof(T) == 2) - dl_tie728_memset_16b(ptr, *temp, len); - else if (sizeof(T) == 4) - dl_tie728_memset_32b(ptr, *temp, len); - else -#endif - for (size_t i = 0; i < len; i++) ptr[i] = value; -} - -/** - * @brief Copy memory. - * - * @param dst pointer of destination - * @param src pointer of source - * @param n byte number - */ -void copy_memory(void *dst, void *src, const size_t n); - -/** - * @brief Apply memory without initialized. Can use free_aligned() to free the memory. - * - * @param number number of elements - * @param size size of element - * @param align number of byte aligned, e.g., 16 means 16-byte aligned - * @param caps Bitwise OR of MALLOC_CAP_* flags indicating the type of memory to be returned - * @return pointer of allocated memory. NULL for failed - */ -inline void *malloc_aligned(int number, int size, int align, uint32_t &caps) -{ - assert((align > 0) && (((align & (align - 1)) == 0))); - int total_size = number * size; - - void *res = heap_caps_aligned_alloc(align, total_size, caps); - if (!res && caps != MALLOC_CAP_8BIT) { - ESP_LOGW(__FUNCTION__, "heap_caps_aligned_alloc failed, retry with MALLOC_CAP_8BIT"); - res = heap_caps_aligned_alloc(align, total_size, MALLOC_CAP_8BIT); - caps = MALLOC_CAP_8BIT; - } - -#if CONFIG_ESP32P4_BOOST - // skip the TCM memory. - if (reinterpret_cast(res) >= 0x30100000 && reinterpret_cast(res) <= 0x30101fff) { - ESP_LOGW(__FUNCTION__, "Malloc skip the TCM memory"); - heap_caps_free(res); - res = NULL; - } -#if DL_SPIRAM_SUPPORT - if (NULL == res) { - res = heap_caps_aligned_alloc(align, total_size, MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM); - caps = MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM; - } -#endif -#endif - - if (NULL == res) { - caps = MALLOC_CAP_8BIT; - ESP_LOGE(__FUNCTION__, - "Fail to malloc %d bytes from DRAM(%d bytyes) and PSRAM(%d bytes), PSRAM is %s.\n", - total_size, - heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL), - heap_caps_get_free_size(MALLOC_CAP_SPIRAM), - DL_SPIRAM_SUPPORT ? "on" : "off"); - } - return res; -} - -inline void *malloc_aligned(int number, int size, int align, uint32_t &&caps) -{ - uint32_t caps_tmp = caps; - return malloc_aligned(number, size, align, caps_tmp); -} - -/** - * @brief Apply memory with zero-initialized. Can use free_aligned() to free the memory. - * - * @param number number of elements - * @param size size of element - * @param align number of byte aligned, e.g., 16 means 16-byte aligned - * @param caps Bitwise OR of MALLOC_CAP_* flags indicating the type of memory to be returned - * @return pointer of allocated memory. NULL for failed - */ -inline void *calloc_aligned(int number, int size, int align, uint32_t &caps) -{ - assert((align > 0) && (((align & (align - 1)) == 0))); - void *res = heap_caps_aligned_calloc(align, number, size, caps); - if (!res && caps != MALLOC_CAP_8BIT) { - ESP_LOGW(__FUNCTION__, "heap_caps_aligned_calloc failed, retry with MALLOC_CAP_8BIT"); - res = heap_caps_aligned_calloc(align, number, size, MALLOC_CAP_8BIT); - caps = MALLOC_CAP_8BIT; - } - -#if CONFIG_ESP32P4_BOOST - // skip the TCM memory. - if (reinterpret_cast(res) >= 0x30100000 && reinterpret_cast(res) <= 0x30101fff) { - ESP_LOGW(__FUNCTION__, "Calloc skip the TCM memory"); - heap_caps_free(res); - res = NULL; - } -#if DL_SPIRAM_SUPPORT - if (NULL == res) { - res = heap_caps_aligned_calloc(align, number, size, MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM); - caps = MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM; - } -#endif -#endif - - if (NULL == res) { - caps = MALLOC_CAP_8BIT; - ESP_LOGE(__FUNCTION__, - "Fail to malloc %d bytes from DRAM(%d bytyes) and PSRAM(%d bytes), PSRAM is %s.\n", - number * size, - heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL), - heap_caps_get_free_size(MALLOC_CAP_SPIRAM), - DL_SPIRAM_SUPPORT ? "on" : "off"); - } - return res; -} - -inline void *calloc_aligned(int number, int size, int align, uint32_t &&caps) -{ - uint32_t caps_tmp = caps; - return calloc_aligned(number, size, align, caps_tmp); -} - -/** - * @brief Free the calloc_aligned() and malloc_aligned() memory - * - * @param address pointer of memory to free - */ -inline void free_aligned(void *address) -{ - if (NULL == address) - return; - - heap_caps_free(address); -} - -template -struct PSRAMAllocator { - typedef T value_type; - - PSRAMAllocator() = default; - - template - constexpr PSRAMAllocator(const PSRAMAllocator &) noexcept - { - } - - template - struct rebind { - using other = PSRAMAllocator; - }; - - T *allocate(std::size_t n) - { - if (n > std::numeric_limits::max() / sizeof(T)) { - return nullptr; - } - if (auto p = static_cast(heap_caps_malloc(n * sizeof(T), MALLOC_CAP_SPIRAM))) { - return p; - } - return nullptr; - } - - void deallocate(T *p, std::size_t) noexcept { heap_caps_free(p); } -}; - -template -bool operator==(const PSRAMAllocator &, const PSRAMAllocator &) -{ - return true; -} - -template -bool operator!=(const PSRAMAllocator &, const PSRAMAllocator &) -{ - return false; -} - -/** - * @brief Truncate the input into int8_t range. - * - * @tparam T supports all integer types - * @param output as an output - * @param input as an input - */ -template -void truncate(int8_t &output, T input) -{ - output = DL_CLIP(input, INT8_MIN, INT8_MAX); -} - -/** - * @brief Truncate the input into int16_t range. - * - * @tparam T supports all integer types - * @param output as an output - * @param input as an input - */ -template -void truncate(int16_t &output, T input) -{ - output = DL_CLIP(input, INT16_MIN, INT16_MAX); -} - -template -void truncate(int32_t &output, T input) -{ - output = DL_CLIP(input, INT32_MIN, INT32_MAX); -} - -template -void truncate(int64_t &output, T input) -{ - output = DL_CLIP(input, INT64_MIN, INT64_MAX); -} - -#if CONFIG_ESP32P4_BOOST -inline int calculate_exponent(int n, int max_value) -{ - int exp; - if (127 == max_value) { - exp = 6; - } - if (32767 == max_value) { - exp = 14; - } - float max_value_float = (float)max_value / (1 << exp); - while (max_value_float > 1.f / n) { - exp += 1; - max_value_float /= 2; - } - - exp -= 1; - return exp; -} -#else -/** - * @brief Calculate the exponent of quantizing 1/n into max_value range. - * - * @param n 1/n: value to be quantized - * @param max_value the max_range - */ -inline int calculate_exponent(int n, int max_value) -{ - int exp = 0; - int tmp = 1 / n; - while (tmp < max_value) { - exp += 1; - tmp = (1 << exp) / n; - } - exp -= 1; - - return exp; -} -#endif - -/** - * @brief Print vector in format "[x1, x2, ...]\n". - * - * @param array to print - */ -inline void print_vector(std::vector &array, const char *message = NULL) -{ - if (message) - printf("%s: ", message); - - printf("["); - for (int i = 0; i < array.size(); i++) { - printf(", %d" + (i ? 0 : 2), array[i]); - } - printf("]\n"); -} - -/** - * @brief Get the cycle object - * - * @return cycle count - */ -inline uint32_t get_cycle() -{ - uint32_t ccount; - // __asm__ __volatile__("rsr %0, ccount" - // : "=a"(ccount) - // : - // : "memory"); - ccount = esp_cpu_get_cycle_count(); - return ccount; -} - -class Latency { -private: - const uint32_t size; /**/ -#if CONFIG_ESP32P4_BOOST && DL_LOG_CACHE_COUNT - uint32_t l2dbus_hit_cnt_s; - uint32_t l2dbus_hit_cnt_e; - uint32_t l2dbus_miss_cnt_s; - uint32_t l2dbus_miss_cnt_e; - uint32_t l2dbus_conflict_cnt_s; - uint32_t l2dbus_conflict_cnt_e; - uint32_t l2dbus_nxtlvl_cnt_s; // 访问下一级存储(flash/psram)计数 - uint32_t l2dbus_nxtlvl_cnt_e; - uint32_t l1dbus_hit_cnt_s; - uint32_t l1dbus_hit_cnt_e; - uint32_t l1dbus_miss_cnt_s; - uint32_t l1dbus_miss_cnt_e; - uint32_t l1dbus_conflict_cnt_s; - uint32_t l1dbus_conflict_cnt_e; - uint32_t l1dbus_nxtlvl_cnt_s; - uint32_t l1dbus_nxtlvl_cnt_e; - - uint32_t l2ibus_hit_cnt_s; - uint32_t l2ibus_hit_cnt_e; - uint32_t l2ibus_miss_cnt_s; - uint32_t l2ibus_miss_cnt_e; - uint32_t l2ibus_nxtlvl_cnt_s; - uint32_t l2ibus_nxtlvl_cnt_e; - uint32_t l1ibus_hit_cnt_s; - uint32_t l1ibus_hit_cnt_e; - uint32_t l1ibus_miss_cnt_s; - uint32_t l1ibus_miss_cnt_e; - uint32_t l1ibus_nxtlvl_cnt_s; - uint32_t l1ibus_nxtlvl_cnt_e; -#endif - -public: - /** - * @brief Construct a new Latency object. - * - * @param size - */ - Latency(const uint32_t size = 1) : size(size), period(0), sum(0), count(0), next(0) - { - this->queue = (this->size > 1) ? (uint32_t *)calloc(this->size, sizeof(uint32_t)) : NULL; -#if CONFIG_ESP32P4_BOOST && DL_LOG_CACHE_COUNT - REG_WRITE(CACHE_L1_CACHE_ACS_CNT_CTRL_REG, ~0); - REG_WRITE(CACHE_L2_CACHE_ACS_CNT_CTRL_REG, ~0); -#endif - } - - /** - * @brief Destroy the Latency object. - * - */ - ~Latency() - { - if (this->queue) - free(this->queue); - } - - /** - * @brief Record the start timestamp. - * - */ - void start() - { -#if DL_LOG_LATENCY_UNIT - this->timestamp = get_cycle(); -#if CONFIG_ESP32P4_BOOST && DL_LOG_CACHE_COUNT - this->l2dbus_hit_cnt_s = REG_READ(L2_DCACHE_ACS_HIT_CNT_REG_n(xPortGetCoreID())); - this->l2dbus_miss_cnt_s = REG_READ(L2_DCACHE_ACS_MISS_CNT_REG_n(xPortGetCoreID())); - this->l2dbus_conflict_cnt_s = REG_READ(L2_DCACHE_ACS_CONFLICT_CNT_REG_n(xPortGetCoreID())); - this->l2dbus_nxtlvl_cnt_s = REG_READ(L2_DCACHE_ACS_NXTLVL_CNT_REG_n(xPortGetCoreID())); - - this->l1dbus_hit_cnt_s = REG_READ(L1_DCACHE_ACS_HIT_CNT_REG_n(xPortGetCoreID())); - this->l1dbus_miss_cnt_s = REG_READ(L1_DCACHE_ACS_MISS_CNT_REG_n(xPortGetCoreID())); - this->l1dbus_conflict_cnt_s = REG_READ(L1_DCACHE_ACS_CONFLICT_CNT_REG_n(xPortGetCoreID())); - this->l1dbus_nxtlvl_cnt_s = REG_READ(L1_DCACHE_ACS_NXTLVL_CNT_REG_n(xPortGetCoreID())); - - this->l2ibus_hit_cnt_s = REG_READ(L2_ICACHE_ACS_HIT_CNT_REG_n(xPortGetCoreID())); - this->l2ibus_miss_cnt_s = REG_READ(L2_ICACHE_ACS_MISS_CNT_REG_n(xPortGetCoreID())); - this->l2ibus_nxtlvl_cnt_s = REG_READ(L2_ICACHE_ACS_NXTLVL_CNT_REG_n(xPortGetCoreID())); - - this->l1ibus_hit_cnt_s = REG_READ(L1_ICACHE_ACS_HIT_CNT_REG_n(xPortGetCoreID())); - this->l1ibus_miss_cnt_s = REG_READ(L1_ICACHE_ACS_MISS_CNT_REG_n(xPortGetCoreID())); - this->l1ibus_nxtlvl_cnt_s = REG_READ(L1_ICACHE_ACS_NXTLVL_CNT_REG_n(xPortGetCoreID())); -#endif -#else - this->timestamp = esp_timer_get_time(); -#endif - } - - /** - * @brief Record the period. - * - */ - void end() - { -#if DL_LOG_LATENCY_UNIT - this->period = get_cycle() - this->timestamp; -#if CONFIG_ESP32P4_BOOST && DL_LOG_CACHE_COUNT - this->l2dbus_hit_cnt_e = REG_READ(L2_DCACHE_ACS_HIT_CNT_REG_n(xPortGetCoreID())); - this->l2dbus_miss_cnt_e = REG_READ(L2_DCACHE_ACS_MISS_CNT_REG_n(xPortGetCoreID())); - this->l2dbus_conflict_cnt_e = REG_READ(L2_DCACHE_ACS_CONFLICT_CNT_REG_n(xPortGetCoreID())); - this->l2dbus_nxtlvl_cnt_e = REG_READ(L2_DCACHE_ACS_NXTLVL_CNT_REG_n(xPortGetCoreID())); - - this->l1dbus_hit_cnt_e = REG_READ(L1_DCACHE_ACS_HIT_CNT_REG_n(xPortGetCoreID())); - this->l1dbus_miss_cnt_e = REG_READ(L1_DCACHE_ACS_MISS_CNT_REG_n(xPortGetCoreID())); - this->l1dbus_conflict_cnt_e = REG_READ(L1_DCACHE_ACS_CONFLICT_CNT_REG_n(xPortGetCoreID())); - this->l1dbus_nxtlvl_cnt_e = REG_READ(L1_DCACHE_ACS_NXTLVL_CNT_REG_n(xPortGetCoreID())); - - this->l2ibus_hit_cnt_e = REG_READ(L2_ICACHE_ACS_HIT_CNT_REG_n(xPortGetCoreID())); - this->l2ibus_miss_cnt_e = REG_READ(L2_ICACHE_ACS_MISS_CNT_REG_n(xPortGetCoreID())); - this->l2ibus_nxtlvl_cnt_e = REG_READ(L2_ICACHE_ACS_NXTLVL_CNT_REG_n(xPortGetCoreID())); - - this->l1ibus_hit_cnt_e = REG_READ(L1_ICACHE_ACS_HIT_CNT_REG_n(xPortGetCoreID())); - this->l1ibus_miss_cnt_e = REG_READ(L1_ICACHE_ACS_MISS_CNT_REG_n(xPortGetCoreID())); - this->l1ibus_nxtlvl_cnt_e = REG_READ(L1_ICACHE_ACS_NXTLVL_CNT_REG_n(xPortGetCoreID())); -#endif -#else - this->period = esp_timer_get_time() - this->timestamp; -#endif - if (this->queue) { - this->sum -= this->queue[this->next]; - this->queue[this->next] = this->period; - this->sum += this->queue[this->next]; - this->next++; - this->next = this->next % this->size; - if (this->count < this->size) { - this->count++; - } - } - } - - /** - * @brief Return the period. - * - * @return this->timestamp_end - this->timestamp - */ - uint32_t get_period() { return this->period; } - - /** - * @brief Get the average period. - * - * @return average latency - */ - uint32_t get_average_period() { return this->queue ? (this->sum / this->count) : this->period; } - - /** - * @brief Clear the period - * - */ - void clear_period() { this->period = 0; } - - /** - * @brief Print in format "latency: {this->period} {unit}\n". - */ - void print() - { -#if DL_LOG_LATENCY_UNIT - printf("latency: %15lu cycle\n", this->get_average_period()); -#else - printf("latency: %15lu us\n", this->get_average_period()); -#endif - } - - /** - * @brief Print in format "{message}: {this->period} {unit}\n". - * - * @param message message of print - */ - void print(const char *message) - { -#if DL_LOG_LATENCY_UNIT - printf("%s: %15lu cycle\n", message, this->get_average_period()); -#else - printf("%s: %15lu us\n", message, this->get_average_period()); -#endif - } - - /** - * @brief Print in format "{prefix}::{key}: {this->period} {unit}\n". - * - * @param prefix prefix of print - * @param key key of print - */ - void print(const char *prefix, const char *key, bool debug = true) - { -#if DL_LOG_LATENCY_UNIT - printf("%s::%s: %lu cycle\n", prefix, key, this->get_average_period()); -#if CONFIG_ESP32P4_BOOST && DL_LOG_CACHE_COUNT - printf("%s::%s: l2 dcache, hit cnt: %lu\n", prefix, key, this->l2dbus_hit_cnt_e - this->l2dbus_hit_cnt_s); - printf("%s::%s: l2 dcache, miss cnt: %lu\n", prefix, key, this->l2dbus_miss_cnt_e - this->l2dbus_miss_cnt_s); - printf("%s::%s: l2 dcache, conflict cnt: %lu\n", - prefix, - key, - this->l2dbus_conflict_cnt_e - this->l2dbus_conflict_cnt_s); - printf( - "%s::%s: l2 dcache, nxtlvl cnt: %lu\n", prefix, key, this->l2dbus_nxtlvl_cnt_e - this->l2dbus_nxtlvl_cnt_s); - printf("%s::%s: l1 dcache, hit cnt: %lu\n", prefix, key, this->l1dbus_hit_cnt_e - this->l1dbus_hit_cnt_s); - printf("%s::%s: l1 dcache, miss cnt: %lu\n", prefix, key, this->l1dbus_miss_cnt_e - this->l1dbus_miss_cnt_s); - printf("%s::%s: l1 dcache, conflict cnt: %lu\n", - prefix, - key, - this->l1dbus_conflict_cnt_e - this->l1dbus_conflict_cnt_s); - printf( - "%s::%s: l1 dcache, nxtlvl cnt: %lu\n", prefix, key, this->l1dbus_nxtlvl_cnt_e - this->l1dbus_nxtlvl_cnt_s); - printf("%s::%s: l2 icache, hit cnt: %lu\n", prefix, key, this->l2ibus_hit_cnt_e - this->l2ibus_hit_cnt_s); - printf("%s::%s: l2 icache, miss cnt: %lu\n", prefix, key, this->l2ibus_miss_cnt_e - this->l2ibus_miss_cnt_s); - printf( - "%s::%s: l2 icache, nxtlvl cnt: %lu\n", prefix, key, this->l2ibus_nxtlvl_cnt_e - this->l2ibus_nxtlvl_cnt_s); - printf("%s::%s: l1 icache, hit cnt: %lu\n", prefix, key, this->l1ibus_hit_cnt_e - this->l1ibus_hit_cnt_s); - printf("%s::%s: l1 icache, miss cnt: %lu\n", prefix, key, this->l1ibus_miss_cnt_e - this->l1ibus_miss_cnt_s); - printf( - "%s::%s: l1 icache, nxtlvl cnt: %lu\n", prefix, key, this->l1ibus_nxtlvl_cnt_e - this->l1ibus_nxtlvl_cnt_s); -#endif -#else - if (debug) - ESP_LOGD("latency", "%s::%s: %lu us\n", prefix, key, this->get_average_period()); - else - printf("%s::%s: %lu us\n", prefix, key, this->get_average_period()); -#endif - } -}; -} // namespace tool -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/include/dl_tool_cache.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/include/dl_tool_cache.hpp deleted file mode 100644 index c35a90f2..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/include/dl_tool_cache.hpp +++ /dev/null @@ -1,71 +0,0 @@ -#pragma once - -#include - -#if CONFIG_IDF_TARGET_ESP32S3 -#include "esp32s3/rom/cache.h" -#include "soc/extmem_reg.h" -#endif - -namespace dl { -namespace tool { -namespace cache { -/** - * @brief Initialize preload. - * - * @param preload One of 1 or 0, - * - 1: turn on the preload - * - 0: turn off the preload - * @return - * - 1: Initialize successfully - * - 0: Initialize successfully, autoload has been turned off - * - -1: Initialize failed, the chip does not support preload - */ -int8_t preload_init(uint8_t preload = 1); - -/** - * @brief Preload memory. - * - * @param addr the start address of data to be preloaded - * @param size the size of the data in byte to be preloaded - */ -void preload_func(uint32_t addr, uint32_t size); - -/** - * @brief Initialize autoload. - * - * @param autoload One of 1 or 0, - * - 1: turn on the autoload - * - 0: turn off the autoload - * @param trigger One of 0 or 1 or 2, - * - 0: miss, TODO:@yuanjiong - * - 1: hit, TODO:@yuanjiong - * - 2: both,TODO:@yuanjiong - * @param line_size the number of cache lines to be autoloaded - * @return status, - * - 1: Initialize sucessfully - * - 0: Initialize suceesfully, preload has been turned off - * - -1: Initialize failed, the chip does not support autoload - */ -int8_t autoload_init(uint8_t autoload = 1, uint8_t trigger = 2, uint8_t line_size = 0); - -/** - * @brief Autoload memory. - * - * @param addr1 the start address of data1 to be autoloaded - * @param size1 the size of the data1 in byte to be preloaded - * @param addr2 the start address of data2 to be autoloaded - * @param size2 the size of the data2 in byte to be preloaded - */ -void autoload_func(uint32_t addr1, uint32_t size1, uint32_t addr2, uint32_t size2); - -/** - * @brief Autoload memory. - * - * @param addr1 the start address of data1 to be autoloaded - * @param size1 the size of the data1 in byte to be preloaded - */ -void autoload_func(uint32_t addr1, uint32_t size1); -} // namespace cache -} // namespace tool -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/esp32p4/dl_esp32p4_memcpy.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/esp32p4/dl_esp32p4_memcpy.S deleted file mode 100644 index 52c1c332..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/esp32p4/dl_esp32p4_memcpy.S +++ /dev/null @@ -1,162 +0,0 @@ - - .text - .align 2 - .global dl_esp32p4_memcpy - .type dl_esp32p4_memcpy, @function - .balign 4 - .option norvc -dl_esp32p4_memcpy: - - # a0: void *store_ptr - # a1: const void *load_ptr - # a2: const int length(bytes) - - # a3: length // 16 - # a4: remainder - # a5: length // 32 - # t3: odd_flag - # t4: store_ptr sar_bytes / remainder_4b - # t5: remainder_1b - # t6: load_ptr sar_bytes - - # a6(not for extension instructions): head unaligned bytes 2 - # a7(not for extension instructions): - # t0(not for extension instructions): tmp value - # t1(not for extension instructions): - # t2(not for extension instructions): - # s2(not for extension instructions): - # s3(not for extension instructions): - # s4(not for extension instructions): - # s5(not for extension instructions): - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - esp.ld.128.usar.ip q0, a1, 0 - esp.movx.r.sar.bytes t6 - - esp.ld.128.usar.ip q1, a0, 0 - esp.movx.r.sar.bytes t4 - li a6, 16 - sub a6, a6, t4 // head unaligned bytes 2 - - li t0, 16 - beq a6, t0, 13f - blt a6, a2, dl_esp32p4_memcpy_done_min - mv a6, a2 -dl_esp32p4_memcpy_done_min: - srli t6, a6, 2 - slli a7, t6, 2 - sub a7, a6, a7 - - mv t0, t6 - blez t0, 10f -9: - lw a3, 0(a1) - addi a1, a1, 4 - sw a3, 0(a0) - addi a0, a0, 4 - addi t0, t0, -1 - bgtz t0, 9b -10: - mv t0, a7 - blez t0, 12f -11: - lbu a3, 0(a1) - addi a1, a1, 1 - sb a3, 0(a0) - addi a0, a0, 1 - addi t0, t0, -1 - bgtz t0, 11b -12: - sub a2, a2, a6 - esp.ld.128.usar.ip q0, a1, 0 - esp.movx.r.sar.bytes t6 -13: - beqz t6, 1f - srli a3, a2, 4 // len // 16 - slli a4, a3, 4 - sub a4, a2, a4 // remainder - - srli a5, a3, 1 // len // 32 - slli t3, a5, 1 - sub t3, a3, t3 // odd_flag - - srli t4, a4, 2 //remainder_4b - slli t5, t4, 2 - sub t5, a4, t5 //remainder_1b - - mv t0, a5 - blez t0, 15f -14: - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q1, a1, 16 - esp.ld.128.usar.ip q2, a1, 0 - esp.src.q q0, q0, q1 - esp.src.q q1, q1, q2 - esp.vst.128.ip q0, a0, 16 - esp.vst.128.ip q1, a0, 16 - addi t0, t0, -1 - bgtz t0, 14b - -15: - beqz t3, 4f - esp.ld.128.usar.ip q0, a1, 16 - esp.ld.128.usar.ip q1, a1, 0 - esp.src.q q0, q0, q1 - esp.vst.128.ip q0, a0, 16 - bnez t3, 4f - -1: - srli a3, a2, 4 // len // 16 - slli a4, a3, 4 - sub a4, a2, a4 // remainder - - srli a5, a3, 1 // len // 32 - slli t3, a5, 1 - sub t3, a3, t3 // odd_flag - - srli t4, a4, 2 //remainder_4b - slli t5, t4, 2 - sub t5, a4, t5 //remainder_1b - - mv t0, a5 - blez t0, 3f -2: - esp.vld.128.ip q0, a1, 16 - esp.vld.128.ip q1, a1, 16 - esp.vst.128.ip q0, a0, 16 - esp.vst.128.ip q1, a0, 16 - addi t0, t0, -1 - bgtz t0, 2b -3: - beqz t3, 4f - esp.vld.128.ip q0, a1, 16 - esp.vst.128.ip q0, a0, 16 -4: - mv t0, t4 - blez t0, 6f -5: - lw a3, 0(a1) - addi a1, a1, 4 - sw a3, 0(a0) - addi a0, a0, 4 - addi t0, t0, -1 - bgtz t0, 5b -6: - - mv t0, t5 - blez t0, 8f -7: - lbu a3, 0(a1) - addi a1, a1, 1 - sb a3, 0(a0) - addi a0, a0, 1 - addi t0, t0, -1 - bgtz t0, 7b -8: - ret \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/esp32p4/dl_esp32p4_round.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/esp32p4/dl_esp32p4_round.S deleted file mode 100644 index 8eb37302..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/esp32p4/dl_esp32p4_round.S +++ /dev/null @@ -1,64 +0,0 @@ - .text - .global dl_esp32p4_cfg_round - .type dl_esp32p4_cfg_round, @function - .balign 4 - .option norvc -dl_esp32p4_cfg_round: - - # a0: int8_t round_mode - # a1: - # a2: - - # a3: - # a4: - # a5: - # t3: - # t4: - # t5: - # t6: - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - slli a0, a0, 4 - esp.movx.r.cfg a1 - or a1, a1, a0 - esp.movx.w.cfg a1 - ret - - - - .text - .global dl_esp32p4_round_half_even - .type dl_esp32p4_round_half_even, @function - .balign 4 - .option norvc -dl_esp32p4_round_half_even: - - # fa0: float value - - # a0: int32_t ret - # a1: - # a2: - - # a3: - # a4: - # a5: - # t3: - # t4: - # t5: - # t6: - - # s0: - # s1: - # s8: - # s9: - # s10: - # s11: - - fcvt.w.s a0, fa0, rne - ret \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/tie728/dl_tie728_bzero.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/tie728/dl_tie728_bzero.S deleted file mode 100644 index 56c91963..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/tie728/dl_tie728_bzero.S +++ /dev/null @@ -1,73 +0,0 @@ - .align 4 - .text - .global dl_tie728_bzero_128b - .type dl_tie728_bzero_128b, @function - .section .iram1 -dl_tie728_bzero_128b: - .align 4 - entry sp, 32 - - # a2: ptr - # a3: n - - srli a3, a3, 4 # a3: n // 16 - - EE.ZERO.Q q0 - loopgtz a3, 1f - EE.VST.128.IP q0, a2, 16 -1: - retw - - - .align 4 - .text - .global dl_tie728_bzero - .type dl_tie728_bzero, @function - .section .iram1 -dl_tie728_bzero: - .align 4 - entry sp, 32 - - # a2: ptr - # a3: n(bytes) - - movi a10, 0 - EE.LD.128.USAR.IP q1, a2, 0 - RUR.SAR_BYTE a8 - beqz a8, 1f - movi a9, 16 - sub a9, a9, a8 # head unaligned bytes - - min a9, a9, a3 - loopgtz a9, 0f - s8i a10, a2, 0 - addi a2, a2, 1 - -0: - sub a3, a3, a9 - blti a3, 1, 4f -1: - srli a4, a3, 4 # n // 16 - slli a5, a4, 4 - sub a5, a3, a5 # remainder - - srli a6, a5, 2 #remainder_4b - slli a7, a6, 2 - sub a7, a5, a7 #remainder_1b - - - EE.ZERO.Q q0 - loopgtz a4, 2f - EE.VST.128.IP q0, a2, 16 -2: - loopgtz a6, 3f - s32i a10, a2, 0 - addi a2, a2, 4 -3: - loopgtz a7, 4f - s8i a10, a2, 0 - addi a2, a2, 1 -4: - retw - - diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/tie728/dl_tie728_memcpy.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/tie728/dl_tie728_memcpy.S deleted file mode 100644 index e7b2bc9e..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/tie728/dl_tie728_memcpy.S +++ /dev/null @@ -1,110 +0,0 @@ - .align 4 - .text - .global dl_tie728_memcpy - .type dl_tie728_memcpy, @function - .section .iram1 -dl_tie728_memcpy: - .align 4 - entry sp, 32 - # a2: store_ptr - # a3: load_ptr - # a4: length(bytes) - - EE.LD.128.USAR.IP q0, a3, 0 - RUR.SAR_BYTE a13 - movi a11, 16 - sub a11, a11, a13 # head unaligned bytes 1 - - EE.LD.128.USAR.IP q1, a2, 0 - RUR.SAR_BYTE a9 - movi a12, 16 - sub a12, a12, a9 # head unaligned bytes 2 - - beqi a12, 16, 11f - min a12, a12, a4 - srli a13, a12, 2 - slli a14, a13, 2 - sub a14, a12, a14 - - loopgtz a13, 9f - l32i a5, a3, 0 - addi a3, a3, 4 - s32i a5, a2, 0 - addi a2, a2, 4 -9: - loopgtz a14, 10f - l8ui a5, a3, 0 - addi a3, a3, 1 - s8i a5, a2, 0 - addi a2, a2, 1 - -10: - sub a4, a4, a12 - EE.LD.128.USAR.IP q0, a3, 0 - RUR.SAR_BYTE a13 -11: - beqz a13, 1f - srli a5, a4, 4 # len // 16 - slli a6, a5, 4 - sub a6, a4, a6 # remainder - - srli a7, a5, 1 # len // 32 - slli a8, a7, 1 - sub a8, a5, a8 # odd_flag - - srli a9, a6, 2 #remainder_4b - slli a10, a9, 2 - sub a10, a6, a10 #remainder_1b - - loopgtz a7, 12f - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q1, a3, 16 - EE.LD.128.USAR.IP q2, a3, 0 - EE.SRC.Q q0, q0, q1 - EE.SRC.Q q1, q1, q2 - EE.VST.128.IP q0, a2, 16 - EE.VST.128.IP q1, a2, 16 -12: - beqz a8, 3f - EE.LD.128.USAR.IP q0, a3, 16 - EE.LD.128.USAR.IP q1, a3, 0 - EE.SRC.Q q0, q0, q1 - EE.VST.128.IP q0, a2, 16 - bnez a8, 3f - -1: - srli a5, a4, 4 # len // 16 - slli a6, a5, 4 - sub a6, a4, a6 # remainder - - srli a7, a5, 1 # len // 32 - slli a8, a7, 1 - sub a8, a5, a8 # odd_flag - - srli a9, a6, 2 #remainder_4b - slli a10, a9, 2 - sub a10, a6, a10 #remainder_1b - - loopgtz a7, 2f - EE.VLD.128.IP q0, a3, 16 - EE.VLD.128.IP q1, a3, 16 - EE.VST.128.IP q0, a2, 16 - EE.VST.128.IP q1, a2, 16 -2: - beqz a8, 3f - EE.VLD.128.IP q0, a3, 16 - EE.VST.128.IP q0, a2, 16 -3: - loopgtz a9, 4f - l32i a5, a3, 0 - addi a3, a3, 4 - s32i a5, a2, 0 - addi a2, a2, 4 -4: - loopgtz a10, 5f - l8ui a5, a3, 0 - addi a3, a3, 1 - s8i a5, a2, 0 - addi a2, a2, 1 -5: - retw \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/tie728/dl_tie728_memset.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/tie728/dl_tie728_memset.S deleted file mode 100644 index 37ee3f65..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/tie728/dl_tie728_memset.S +++ /dev/null @@ -1,158 +0,0 @@ - .align 4 - .text - .global dl_tie728_memset_8b - .type dl_tie728_memset_8b, @function - .section .iram1 -dl_tie728_memset_8b: - .align 4 - entry sp, 32 - # a2: input_ptr - # a3: value - # a4: length(n) - - s8i a3, a1, 0 // store to stack - mov a7, a1 - EE.VLDBC.8 q0, a7 - - EE.LD.128.USAR.IP q1, a2, 0 - RUR.SAR_BYTE a10 - beqz a10, 2f - movi a11, 16 - sub a11, a11, a10 # head unaligned bytes - - min a11, a11, a4 - loopgtz a11, 1f - s8i a3, a2, 0 - addi a2, a2, 1 - -1: - sub a4, a4, a11 - blti a4, 1, 4f - -2: - srli a5, a4, 4 # len // 16 - slli a6, a5, 4 - sub a6, a4, a6 # remainder - - loopgtz a5, 3f - EE.VST.128.IP q0, a2, 16 -3: - loopgtz a6, 4f - s8i a3, a2, 0 - addi a2, a2, 1 -4: - retw - - - - .align 4 - .text - .global dl_tie728_memset_16b - .type dl_tie728_memset_16b, @function - .section .iram1 -dl_tie728_memset_16b: - .align 4 - entry sp, 32 - # a2: input_ptr - # a3: value_ptr - # a4: length(n) - - s16i a3, a1, 0 // store to stack - mov a7, a1 - EE.VLDBC.16 q0, a7 - - EE.LD.128.USAR.IP q1, a2, 0 - RUR.SAR_BYTE a10 - beqz a10, 2f - movi a11, 16 - sub a11, a11, a10 # head unaligned bytes - - movi a8, 2 - rems a9, a11, a8 - beqz a9, 0f - loopgtz a4, 5f - s16i a3, a2, 0 - addi a2, a2, 2 -5: - retw - -0: - srli a11, a11, 1 - min a11, a11, a4 - loopgtz a11, 1f - s16i a3, a2, 0 - addi a2, a2, 2 - -1: - sub a4, a4, a11 - blti a4, 1, 4f - -2: - srli a5, a4, 3 # len // 8 - slli a6, a5, 3 - sub a6, a4, a6 # remainder - - loopgtz a5, 3f - EE.VST.128.IP q0, a2, 16 -3: - loopgtz a6, 4f - s16i a3, a2, 0 - addi a2, a2, 2 -4: - retw - - - .align 4 - .text - .global dl_tie728_memset_32b - .type dl_tie728_memset_32b, @function - .section .iram1 -dl_tie728_memset_32b: - .align 4 - entry sp, 32 - # a2: input_ptr - # a3: value_ptr - # a4: length(n) - - s32i a3, a1, 0 // store to stack - mov a7, a1 - EE.VLDBC.32 q0, a7 - - EE.LD.128.USAR.IP q1, a2, 0 - RUR.SAR_BYTE a10 - beqz a10, 2f - movi a11, 16 - sub a11, a11, a10 # head unaligned bytes - - movi a8, 4 - rems a9, a11, a8 - beqz a9, 0f - loopgtz a4, 5f - s32i a3, a2, 0 - addi a2, a2, 4 -5: - retw -0: - srli a11, a11, 2 - min a11, a11, a4 - loopgtz a11, 1f - s32i a3, a2, 0 - addi a2, a2, 4 - -1: - sub a4, a4, a11 - blti a4, 1, 4f - -2: - srli a5, a4, 2 # len // 4 - slli a6, a5, 2 - sub a6, a4, a6 # remainder - - loopgtz a5, 3f - EE.VST.128.IP q0, a2, 16 -3: - loopgtz a6, 4f - s32i a3, a2, 0 - addi a2, a2, 4 -4: - retw \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/xtensa/dl_xtensa_bzero.S b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/xtensa/dl_xtensa_bzero.S deleted file mode 100644 index 8cf2203c..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/isa/xtensa/dl_xtensa_bzero.S +++ /dev/null @@ -1,20 +0,0 @@ - .align 4 - .text - .global dl_xtensa_bzero_32b - .type dl_xtensa_bzero_32b, @function - .section .iram1 -dl_xtensa_bzero_32b: - .align 4 - entry sp, 32 - - # a2: ptr - # a3: n - - srli a3, a3, 2 # a3: n // 4 - movi a4, 0 # a4: zero - - loopgtz a3, 1f - s32i a4, a2, 0 - addi a2, a2, 4 -1: - retw diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/src/dl_tool.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/src/dl_tool.cpp deleted file mode 100644 index 5923f9d3..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/src/dl_tool.cpp +++ /dev/null @@ -1,87 +0,0 @@ -#include "dl_tool.hpp" -#include - -extern "C" { -#if CONFIG_XTENSA_BOOST -void dl_xtensa_bzero_32b(void *ptr, const int n); -#endif - -#if CONFIG_TIE728_BOOST -void dl_tie728_bzero_128b(void *ptr, const int n); -void dl_tie728_bzero(void *ptr, const int n); -void dl_tie728_memcpy(void *dst, const void *src, const size_t n); -#endif - -#if CONFIG_ESP32P4_BOOST -void dl_esp32p4_memcpy(void *dst, const void *src, const size_t n); -#endif -} - -namespace dl { -namespace tool { - -int round_half_even(float value) -{ -#if CONFIG_ESP32P4_BOOST - return dl_esp32p4_round_half_even(value); -#else - float rounded; - if (value < 0) { - rounded = value - 0.5f; - } else { - rounded = value + 0.5f; - } - - int int_part = (int)rounded; - if (rounded == (float)int_part) { - if (int_part % 2 != 0) { - if (value < 0) - int_part++; - else - int_part--; - } - } - return int_part; -#endif -} - -int round_half_up(float value) -{ - return (int)floorf(value + 0.5); -} - -int round_down(float value) -{ - return (int)floorf(value); -} - -int round(float value) -{ -#if CONFIG_IDF_TARGET_ESP32P4 - return round_half_even(value); -#else - return round_half_up(value); -#endif -} - -void set_zero(void *ptr, const int n) -{ -#if CONFIG_TIE728_BOOST - dl_tie728_bzero(ptr, n); -#else - bzero(ptr, n); -#endif -} - -void copy_memory(void *dst, void *src, const size_t n) -{ -#if CONFIG_ESP32P4_BOOST - dl_esp32p4_memcpy(dst, src, n); -#elif CONFIG_TIE728_BOOST - dl_tie728_memcpy(dst, src, n); -#else - memcpy(dst, src, n); -#endif -} -} // namespace tool -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/src/dl_tool_cache.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/src/dl_tool_cache.cpp deleted file mode 100644 index 1c6dbeb1..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/tool/src/dl_tool_cache.cpp +++ /dev/null @@ -1,164 +0,0 @@ -#include "dl_tool_cache.hpp" - -namespace dl { -namespace tool { -namespace cache { -static uint8_t autoload_trigger = 2; // TODO: typedef enum 提高可读性 -static uint8_t autoload_linesize = 0; -static uint8_t autoload_enable = 1; -static uint8_t preload_enable = 0; - -int8_t preload_init(uint8_t preload) -{ -#if CONFIG_IDF_TARGET_ESP32S3 - preload_enable = preload; - if (autoload_enable && preload_enable) { - Cache_Disable_DCache_Autoload(); - autoload_enable = 0; - printf("preload has been turned on, and autoload has been turned off\n"); - return 0; - } - return 1; -#endif - return -1; -} - -void preload_func(uint32_t addr, uint32_t size) -{ -#if CONFIG_IDF_TARGET_ESP32S3 - if (preload_enable && (!autoload_enable)) { - uint8_t enable = (addr < SOC_EXTRAM_DATA_HIGH) ? 1 : 0; - if (enable) { - while (!Cache_DCache_Preload_Done()) { - ; - } - Cache_Start_DCache_Preload(addr, size, 0); - } - } -#endif -} - -int8_t autoload_init(uint8_t autoload, uint8_t trigger, uint8_t line_size) -{ -#if CONFIG_IDF_TARGET_ESP32S3 -#if CONFIG_TIE728_BOOST - autoload_trigger = trigger; - autoload_linesize = line_size; - autoload_enable = autoload; - struct autoload_config config = { - autoload_enable, - CACHE_AUTOLOAD_POSITIVE, - autoload_trigger, - autoload_linesize, - }; - Cache_Config_DCache_Autoload(&config); - Cache_Enable_DCache_Autoload(); -#else - autoload_trigger = trigger; - autoload_linesize = line_size; - autoload_enable = autoload; -#endif - if (preload_enable && autoload_enable) { - while (!Cache_DCache_Preload_Done()) { - ; - } - preload_enable = 0; - printf("autoload has been turned on, and preload has been turned off\n"); - return 0; - } - return 1; -#endif - return -1; -} - -void autoload_func(uint32_t addr1, uint32_t size1, uint32_t addr2, uint32_t size2) -{ -#if CONFIG_IDF_TARGET_ESP32S3 -#if CONFIG_TIE728_BOOST - if (autoload_enable && (!preload_enable)) { - uint8_t input1_enable = (addr1 < SOC_EXTRAM_DATA_HIGH) ? 1 : 0; - uint8_t input2_enable = (addr2 < SOC_EXTRAM_DATA_HIGH) ? 1 : 0; - - // config first region - struct autoload_region_config region0 = { - 0, // region - input1_enable, // ena - addr1, // addr - size1, // autoload region size 0-0x03FFFFFF - }; - Cache_Config_DCache_Region_Autoload(®ion0); - - // config second region - struct autoload_region_config region1 = { - 1, // region - input2_enable, // ena - addr2, // addr - size2, // autoload region size 0-0x03FFFFFF - }; - Cache_Config_DCache_Region_Autoload(®ion1); - } -#else - if (autoload_enable && (!preload_enable)) { - Cache_Disable_DCache_Autoload(); - uint8_t input1_enable = (addr1 < SOC_EXTRAM_DATA_HIGH) ? 1 : 0; - uint8_t input2_enable = (addr2 < SOC_EXTRAM_DATA_HIGH) ? 1 : 0; - struct autoload_config config = { - CACHE_AUTOLOAD_POSITIVE, - autoload_trigger, - input1_enable, - input2_enable, - addr1, - size1, // autoload max size 0x03FFFFFF - addr2, - size2, // autoload max size 0x03FFFFFF - }; - Cache_Config_DCache_Autoload(&config); - REG_SET_FIELD(EXTMEM_DCACHE_AUTOLOAD_CTRL_REG, EXTMEM_DCACHE_AUTOLOAD_SIZE, autoload_linesize); // default 0 - Cache_Enable_DCache_Autoload(); - // printf("autoload_start!\n"); - } -#endif -#endif -} - -void autoload_func(uint32_t addr1, uint32_t size1) -{ -#if CONFIG_IDF_TARGET_ESP32S3 -#if CONFIG_TIE728_BOOST - if (autoload_enable && (!preload_enable)) { - uint8_t input1_enable = (addr1 < SOC_EXTRAM_DATA_HIGH) ? 1 : 0; - - // config first region - struct autoload_region_config region0 = { - 0, // region - input1_enable, // ena - addr1, // addr - size1, // autoload region size 0-0x03FFFFFF - }; - Cache_Config_DCache_Region_Autoload(®ion0); - } -#else - if (autoload_enable && (!preload_enable)) { - Cache_Disable_DCache_Autoload(); - uint8_t input1_enable = (addr1 < SOC_EXTRAM_DATA_HIGH) ? 1 : 0; - struct autoload_config config = { - CACHE_AUTOLOAD_POSITIVE, - autoload_trigger, - input1_enable, - 0, - addr1, - size1, // autoload max size 0x03FFFFFF - addr1, - size1, // autoload max size 0x03FFFFFF - }; - Cache_Config_DCache_Autoload(&config); - REG_SET_FIELD(EXTMEM_DCACHE_AUTOLOAD_CTRL_REG, EXTMEM_DCACHE_AUTOLOAD_SIZE, autoload_linesize); // default 0 - Cache_Enable_DCache_Autoload(); - // printf("autoload_start!\n"); - } -#endif -#endif -} -} // namespace cache -} // namespace tool -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/include/dl_constant.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/include/dl_constant.hpp deleted file mode 100644 index c6ecff1c..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/include/dl_constant.hpp +++ /dev/null @@ -1,157 +0,0 @@ -#pragma once - -#include "dl_define.hpp" -#include "dl_tool.hpp" -#include -#include - -namespace dl { -typedef enum { - MEMORY_RELAYOUT_NO_CHANGE = 0, // Don't change layout - MEMORY_RELAYOUT_INTERTWINE_32_32 = - 1, // Forming a 64-bit data by interposing 32-bit all-zero data with 32-bit valid data. - MEMORY_RELAYOUT_MAX = MEMORY_RELAYOUT_INTERTWINE_32_32, -} memory_relayout_type_t; - -/** - * @brief Base class of Filter, Bias, Activation. - * - * @tparam T supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t quantize, - * - int8_t: stands for operation in int8_t quantize. - */ -template -class Constant { -public: - const T *element; /**/ - const int exponent; /**/ - const std::vector shape; /**/ - const bool dynamic_alloc; /**/ - - /** - * @brief Construct a new Constant object. - * - * @param element point to element. - * @param exponent exponent of element. - * @param shape shape of Constant. - * @param dynamic_alloc whether to dynamically allocate memory.. - */ - Constant(const T *element, - const int exponent, - const std::vector shape, - const bool dynamic_alloc = false, - const dl::memory_relayout_type_t memory_relayout = dl::MEMORY_RELAYOUT_NO_CHANGE); - - /** - * @brief destruction a Constant object. - * - */ - virtual ~Constant(); -}; - -/** - * @brief Filter. - * NOTE: The shape format of filter is fixed, but the element sequence depands on optimization method. - * - 1D: reserved - * - 2D: shape format is [filter_height, filter_width, input_channel, output_channel]. dilation format is [height, - * width] - * - * @tparam T supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t quantize, - * - int8_t: stands for operation in int8_t quantize. - */ -template -class Filter : public Constant { -public: - const std::vector dilation; /**/ - /**/ - std::vector shape_with_dilation; /**/ - /**/ - const int8_t *channel_exponent; /**/ - const int channel_exponent_size; - - /** - * @brief Construct a new Filter object. - * - * @param element point to element - * @param exponent exponent of element - * @param shape shape of Filter, - * - 1D: reserved - * - 2D: for convolution is [filter_height, filter_width, input_channel, output_channel], - * for depthwise convolution is [filter_height, filter_width, input_channel, 1] - * @param dilation dilation of Filter - * - 1D: reserved - * - 2D: [dilation_in_height, dilation_in_width] - */ - Filter(const T *element, - const int exponent, - const std::vector shape, - const std::vector dilation = {1, 1}, - const bool dynamic_alloc = false); - - /** - * @brief Construct a new Filter object. it is only avaliable to int16_t - * - * @param element point to element - * @param channel_exponent exponent for per-channel - * @param channel_exponent_size size of exponent - * @param shape shape of element - * @param dilation dilation of Filter - * - 1D: reserved - * - 2D: [dilation_in_height, dilation_in_width] - */ - Filter(const T *element, - const int8_t *channel_exponent, - const int channel_exponent_size, - const std::vector shape, - const std::vector dilation = {1, 1}); - - /** - * @brief Print the n-th filter. - * - * @param n index of output_channel - * @param message to print - */ - void print2d_n(const int n, const char *message) const; -}; - -/** - * @brief Bias. - * - * @tparam T supports int16_t and int8_t - * - int16_t: stands for operation in int16_t quantize - * - int8_t: stands for operation in int8_t quantize - */ -template -class Bias : public Constant { -public: - using Constant::Constant; -}; - -/** - * @brief Activation. - * - * @tparam T supports int16_t and int8_t - * - int16_t: stands for operation in int16_t quantize - * - int8_t: stands for operation in int8_t quantize - */ -template -class Activation : public Constant { -public: - const activation_type_t type; /* shape = {0}, - const bool dynamic_alloc = false); -}; -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/include/dl_tensor_base.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/include/dl_tensor_base.hpp deleted file mode 100644 index af5f393e..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/include/dl_tensor_base.hpp +++ /dev/null @@ -1,260 +0,0 @@ -#pragma once - -#include "dl_tool.hpp" -#include "esp_log.h" -#include -#include -#include -#include -#include -#include - -namespace dl { - -/** - * @brief The data type of esp-dl is same as flatbuffer's data type. - */ -typedef enum { - DATA_TYPE_UNDEFINED = 0, - DATA_TYPE_FLOAT = 1, - DATA_TYPE_UINT8 = 2, - DATA_TYPE_INT8 = 3, - DATA_TYPE_UINT16 = 4, - DATA_TYPE_INT16 = 5, - DATA_TYPE_INT32 = 6, - DATA_TYPE_INT64 = 7, - DATA_TYPE_STRING = 8, - DATA_TYPE_BOOL = 9, - DATA_TYPE_FLOAT16 = 10, - DATA_TYPE_DOUBLE = 11, - DATA_TYPE_UINT32 = 12, - DATA_TYPE_UINT64 = 13, - DATA_TYPE_MIN = DATA_TYPE_UNDEFINED, - DATA_TYPE_MAX = DATA_TYPE_UINT64 -} dtype_t; - -/** - * quantize float data into integer data - */ -int quantize(float input, float scale, float quant_min, float quant_max); - -/** - * @brief dequantize integer data into float data - */ -float dequantize(int input, float scale); - -/** - * @brief Return the bytes of data type - */ -size_t dtype_sizeof(dtype_t dtype); - -/** - * @brief Return the data type string - */ -const char *dtype_to_string(dtype_t dtype); - -/** - * @brief Return acivation type string - */ -const char *activation_type_to_string(activation_type_t type); - -/** - * @brief Return quant type string - */ -const char *quant_type_to_string(quant_type_t type); - -/** - * @brief Convert shape(vector) to string - */ -std::string shape_to_string(std::vector shape); - -/** - * @brief This class is designed according to PyTorch Tensor. - * TensorBase is required to ensure that the first address are aligned to 16 bytes and the memory size should be a - * multiple of 16 bytes. - * - * TODO:: Implement more functions - */ -class TensorBase { -public: - int size; /* shape; /* axis_offset; /*size = 0; - this->shape = {}; - this->dtype = DATA_TYPE_FLOAT; - this->exponent = 0; - this->auto_free = true; - this->axis_offset = {}; - this->data = nullptr; - this->cache = nullptr; - this->caps = MALLOC_CAP_8BIT; - } - - TensorBase(std::vector shape, - const void *element, - int exponent = 0, - dtype_t dtype = DATA_TYPE_FLOAT, - bool deep = true, - uint32_t caps = MALLOC_CAP_8BIT); - - virtual ~TensorBase() - { - if (this->auto_free) { - heap_caps_free(this->data); - } - } - - /** - * @brief Assign tensor to this tensor - * - * @param tensor - * - * @return ture if assign successfully, otherwise false. - */ - bool assign(TensorBase *tensor); - - /** - * @brief Assign data to this tensor - * - * @param shape - * @param element - * @param exponent - * @param dtype - * - * @return ture if assign successfully, otherwise false. - */ - bool assign(std::vector shape, const void *element, int exponent, dtype_t dtype); - - /** - * @brief Get the size of Tensor. - * - * @return the size of Tensor. - */ - int get_size() { return this->size; } - - /** - * @brief Get the aligned size of Tensor. - * - * @return the aligned size of Tensor. - */ - int get_aligned_size() - { - int align = 16 / this->get_dtype_bytes(); - return this->size % align == 0 ? this->size : this->size + align - this->size % align; - } - - /** - * @brief Get the dtype size, in bytes. - * - * @return the size of dtype. - */ - size_t get_dtype_bytes() { return dtype_sizeof(this->dtype); } - - /** - * @brief Get the dtype string of Tensor. - * - * @return the string of Tensor's dtype. - */ - const char *get_dtype_string() { return dtype_to_string(this->dtype); } - - /** - * @brief Get the bytes of Tensor. - * - * @return the bytes of Tensor. - */ - int get_bytes() { return this->size * this->get_dtype_bytes(); } - - /** - * @brief Get element pointer. If cache(preload data pointer) is not null, return cache pointer, otherwise return - * data pointer. - * - * @return the pointer of Tensor's element - */ - virtual void *get_element_ptr() - { - if (this->cache) { - return this->cache; // If preload cache is not null, use this pointer - } - - return this->data; - } - - /** - * @brief Get the index of each dims - * - * @param element_index the index of the element - * @return std::vector the index of each dims - */ - virtual std::vector get_axis_index(int element_index); - - virtual void preload() - { - if (this->cache) { - tool::copy_memory(this->cache, this->cache, this->get_bytes()); - } - } - - /** - * @brief Set the shape of Tensor. - * @param shape the shape of Tensor. - * @return Tensor. - */ - TensorBase &set_shape(const std::vector shape); - - std::vector get_shape() { return this->shape; } - - dtype_t get_dtype() { return this->dtype; } - - size_t set_preload_addr(void *addr, size_t size); - - void reset_bias_layout(quant_type_t op_quant_type, bool is_depthwise); - - /** - * @brief Change a new shape to the Tensor without changing its data. - * - * @param shape the target shape - * @return TensorBase& self - */ - TensorBase &reshape(std::vector shape); - - template - TensorBase *transpose(T *input_element, - std::vector &input_shape, - std::vector &input_axis_offset, - std::vector &perm); - - /** - * @brief Reverse or permute the axes of the input Tensor - * - * @param input the input Tensor - * @param perm the new arangement of the dims. if perm == {}, the dims arangement will be reversed. - * @return TensorBase *self - */ - TensorBase *transpose(TensorBase *input, std::vector perm = {}); - - /** - * @brief Get the index of element - * - * @param axis_index the index of each dims - * @return int the index of element - */ - int get_element_index(const std::vector axis_index); - - // int& operator[](size_t index) { - // if (index >= this->size) { - // throw std::out_of_range("Index out of range"); - // } - // return data[index]; - // } -}; -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/include/dl_variable.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/include/dl_variable.hpp deleted file mode 100644 index 7109fd78..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/include/dl_variable.hpp +++ /dev/null @@ -1,605 +0,0 @@ -#pragma once - -#include "dl_tensor_base.hpp" -#include "esp_heap_caps.h" -#include -namespace dl { - -/** - * @brief Tensor - * - * @tparam T support int8_t, int16_t and float. - */ -template -class Tensor : public TensorBase { -private: - /** - * @brief Get dtype from Template type - * - */ - dtype_t get_template_dtype() - { - if (std::is_same::value) { - return DATA_TYPE_INT8; - } else if (std::is_same::value) { - return DATA_TYPE_INT16; - } else if (std::is_same::value) { - return DATA_TYPE_FLOAT; - } else if (std::is_same::value) { - return DATA_TYPE_INT32; - } else { - return DATA_TYPE_UNDEFINED; - } - return DATA_TYPE_UNDEFINED; - } - -public: - T *element; /* shape; /*auto_free = true, this->exponent = 0; - this->set_shape({0}); - this->dtype = this->get_template_dtype(); - } - - /** - * @brief Construct a new Tensor object by copying from input. - * - * @param input an input Tensor - * @param deep one of true or false - * - true: apply a new memory, copy value from input.element to this new memory - * - false: take over input.element to this->element - */ - Tensor(Tensor &input, bool deep) : TensorBase() - { - this->auto_free = input.auto_free; - this->exponent = input.exponent; - this->set_shape(input.shape); - this->dtype = input.dtype; - if (deep && (input.element != NULL)) { - int size_real = input.get_size(); - T *new_element = (T *)tool::calloc_aligned(size_real, sizeof(T), 16, MALLOC_CAP_8BIT); - tool::copy_memory(new_element, input.element, size_real * sizeof(T)); - this->element = new_element; - } else { - this->element = input.element; - this->auto_free = false; - } - } - - /** - * @brief Construct a new Tensor object by element and shape. - * - */ - Tensor(std::vector shape, - T *element = nullptr, - int exponent = 0, - bool deep = true, - uint32_t caps = MALLOC_CAP_8BIT) : - TensorBase() - { - this->set_shape(shape); - this->exponent = exponent; - this->dtype = this->get_template_dtype(); - if (element) { - if (deep) { - this->auto_free = true; - this->element = (T *)tool::malloc_aligned(this->get_size(), sizeof(T), 16, caps); - tool::copy_memory(this->element, element, this->get_size() * sizeof(T)); - } else { - this->auto_free = false; - this->element = element; - } - } else { - this->auto_free = true; - this->element = (T *)tool::malloc_aligned(this->get_size(), sizeof(T), 16, caps); - } - } - - /** - * @brief Construct a new Tensor object by element and shape. - * - */ - Tensor( - std::vector shape, const T *element, int exponent = 0, bool deep = true, uint32_t caps = MALLOC_CAP_8BIT) : - TensorBase() - { - this->set_shape(shape); - this->exponent = exponent; - this->dtype = this->get_template_dtype(); - if (element) { - if (deep) { - this->auto_free = true; - this->element = (T *)tool::malloc_aligned(this->get_size(), sizeof(T), 16, caps); - tool::copy_memory(this->element, const_cast(element), this->get_size() * sizeof(T)); - } else { - this->auto_free = false; - this->element = const_cast(element); - } - } else { - this->auto_free = true; - this->element = (T *)tool::malloc_aligned(this->get_size(), sizeof(T), 16, caps); - } - } - - /** - * @brief Destroy the Tensor object - * - */ - ~Tensor() - { - if (this->auto_free) - this->free_element(); - } - - /** - * @brief copy the element of the input Tensor. - * - * @param input an input Tensor - * @param deep one of true or false - * - true: apply a new memory, copy value from input.element to this new memory - * - false: take over input.element to this->element - * @return Tensor& self - */ - Tensor ©_element(Tensor &input, bool deep) - { - assert(this->get_size() == input.get_size()); - assert(input.element != NULL); - - this->malloc_element(); - if (deep) { - tool::copy_memory(this->element, input.element, this->get_size() * sizeof(T)); - } else { - this->element = input.element; - this->auto_free = false; - } - return *this; - } - - /** - * @brief Set the auto free object. - * - * @param auto_free one of true or false - * - true: free element when object destroyed - * - false: do not - * @return self - */ - Tensor &set_auto_free(const bool auto_free) - { - this->auto_free = auto_free; - return *this; - } - - /** - * @brief Set the element. - * - * @param element point to element memory - * @return self - */ - Tensor &set_element(T *element, const bool auto_free = false) - { - // assert(this->element == NULL); - this->element = element; - this->auto_free = auto_free; - - return *this; - } - - /** - * @brief Set the exponent. - * - * @param exponent exponent of element - * @return self - */ - Tensor &set_exponent(const int exponent) - { - this->exponent = exponent; - - return *this; - } - - /** - * @brief Set the shape of Tensor. - * - * @param shape the target shape - * - * @return self - */ - Tensor &set_shape(const std::vector shape); - - /** - * @brief print the shape of the Tensor - * - */ - void print_shape() - { - if (this->shape.size()) { - printf("shape = ("); - for (int i = 0; i < this->shape.size() - 1; i++) { - printf("%d, ", this->shape[i]); - } - printf("%d)\n", this->shape.back()); - } else { - printf("shape = ()\n"); - } - } - - /** - * @brief flatten the Tensor - * - * @return Tensor& self - */ - Tensor &flatten(); - - /** - * @brief Change a new shape to the Tensor without changing its data. - * - * @param shape the target shape - * @return Tensor& self - */ - Tensor &reshape(std::vector shape); - - /** - * @brief Remove dims with length==1 from Tensor - * - * @param axis the dim to to be remove. make sure the length of the dim is equal to 1. - * if axis == INT32_MAX, all the dims with length==1 will be removed. - * @return Tensor& self - */ - Tensor &squeeze(int axis = INT32_MAX); - - /** - * @brief Insert a new dim that will appear at the axis position in the expanded Tensor shape. - * - * @param axis the dim to be inserted - * @return Tensor& self - */ - Tensor &expand_dims(int axis); - - /** - * @brief Insert a new dim that will appear at the axis position in the expanded Tensor shape. - * - * @param axis the dim to be inserted - * @return Tensor& self - */ - Tensor &expand_dims(std::vector axis); - - /** - * @brief Reverse or permute the axes of the Tensor - * - * @param perm the new arangement of the dims. if perm == {}, the dims arangement will be reversed. - * @return Tensor& self - */ - Tensor &transpose(std::vector perm = {}); - - /** - * @brief Reverse or permute the axes of the input Tensor - * - * @param input the input Tensor - * @param perm the new arangement of the dims. if perm == {}, the dims arangement will be reversed. - * @return Tensor& self - */ - Tensor &transpose(Tensor &input, std::vector perm = {}); - - /** - * @brief Get the element pointer. - * - * @return pointer to memory - */ - void *get_element_ptr() { return this->element; } - - /** - * @brief Get the element value. - * - * @param index the index of each dim. - * @return T element value - */ - T get_element_value(const std::vector index) { return this->element[this->get_element_index(index)]; } - - /** - * @brief Get the element value. - * - * @param index the index of the element. - * @return T element value - */ - T get_element_value(int index) { return this->element[index]; } - - /** - * @brief Set the all the element to value. - * - * @param value target value - * @return Tensor& self - */ - Tensor &set_value(T value); - - /** - * @brief Set the the element to value - * - * @param value target value, it will be broadcast automatically. - * @return Tensor& self - */ - Tensor &set_value(Tensor &value); - - /** - * @brief Set the sliced element to value - * - * @param axis_index_range range of slices - * @param value target value - * @return Tensor& self - */ - Tensor &set_value(std::vector axis_index_range, T value); - - /** - * @brief Set the sliced element to value - * - * @param axis_index_range range of slices - * @param value target value, it will be broadcast automatically. - * @return Tensor& self - */ - Tensor &set_value(std::vector axis_index_range, Tensor &value); - - /** - * @brief Extracts a slice from the Tensor. - * - * @param axis_index_range range of slices - * @return Tensor output - */ - Tensor slice(std::vector axis_index_range); - - /** - * @brief Reverses specific dims of the tensor. - * - * @param axis The dims to be reversed - * @return Tensor& - */ - Tensor &reverse(std::vector axis); - - /** - * @brief Get the axis offset - * - * @return std::vector the axis offset - */ - std::vector get_axis_offset() { return this->axis_offset; } - - /** - * @brief Apply memory with zero-initialized only if this->element is NULL. - * - * @param auto_free one of true or false - * - true: free element when object destroyed - * - false: do not - * @return - * - true: on success - * - false: if applying failed - */ - bool calloc_element(const bool auto_free = true) - { - if (this->element != NULL) - return false; - - this->element = (T *)tool::calloc_aligned(this->get_size(), sizeof(T), 16, MALLOC_CAP_8BIT); - this->auto_free = auto_free; - - return true; - } - - /** - * @brief Apply memory without initialized only if this->element is NULL. - * - * @param auto_free one of true or false - * - true: free element when object destroyed - * - false: do not - * @return - * - true: on success - * - false: if applying failed - */ - bool malloc_element(const bool auto_free = true) - { - if (this->element != NULL) - return false; - - this->element = (T *)tool::malloc_aligned(this->get_size(), sizeof(T), 16, MALLOC_CAP_8BIT); - this->auto_free = auto_free; - - return true; - } - - /** - * @brief free element only if this->element != NULL - * set this->element to NULL, after free - * @brief Free element if this->element is not NULL. - */ - void free_element() - { - if (this->auto_free && this->element) { - tool::free_aligned(this->element); - this->element = NULL; - } - } - - /** - * @brief print the element of the tensor - * - * @param axis_index_range the element range of each dims to be print. if axis_index_range == {}, all the element - * will be print. - * @param message to print - */ - void print(std::vector axis_index_range = {}, const char *message = ""); - - /** - * @brief print all the element of the Tensor. - * - * @param message to print - */ - void print_all(const char *message = "") - { - std::cout << "\n" << message << " | "; - this->print_shape(); - - for (int i = 0; i < this->get_size(); i++) { - std::cout << this->element[i] << " "; - } - std::cout << "\n"; - return; - } - - /** - * @brief Get the index of element - * - * @param axis_index the index of each dims - * @return int the index of element - */ - int get_element_index(const std::vector axis_index); - - /** - * @brief Check the element value with input ground-truth. - * - * @param gt_element ground-truth value of element - * @param bias permissible error - * @param info one of true or false - * - true: shape and result - * - false: do not - * @param failed_number maximum number of wrong element that will be printed - * - * @return - * - true: in permissible error - * - false: not - */ - bool check_element(T *gt_element, int bias = 2, bool info = true, int failed_number = 0) - { - int count = 0; - if (info) - this->print_shape(); - int size = this->get_size(); - for (int i = 0; i < size; i++) { - if (DL_ABS(this->element[i] - gt_element[i]) > bias) { - std::vector index = get_axis_index(i); - std::cout << "element["; - for (int j = 0; j < index.size() - 1; j++) { - std::cout << index[j] << ", "; - } - std::cout << index.back() << "]: "; - std::cout << +this->element[i] << " v.s. " << +gt_element[i] << "\n"; - count++; - if (count > failed_number) - return false; - } - } - if (count) - return false; - - if (info) - printf("PASS\n"); - - return true; - } - - /** - * @brief Check the shape is the same as the shape of input. - * - * @param input an input tensor - * @return - * - true: same shape - * - false: not - */ - bool is_same_shape(const TensorBase &input) - { - if (input.shape.size() != this->shape.size()) { - return false; - } - for (int i = 0; i < this->shape.size(); i++) { - if (input.shape[i] != this->shape[i]) { - return false; - } - } - return true; - } - - Tensor &operator=(const Tensor &input) - { - this->auto_free = input.auto_free; - this->exponent = input.exponent; - int size_real_tmp = this->size; - int size_input_real = input.size; - this->set_shape(input.shape); - if (input.element) { - if (this->element) { - if (size_real_tmp != size_input_real) { - tool::free_aligned(this->element); - T *new_element = (T *)tool::malloc_aligned(size_input_real, sizeof(T), 16, MALLOC_CAP_8BIT); - tool::copy_memory(new_element, input.element, size_input_real * sizeof(T)); - this->element = new_element; - } else { - tool::copy_memory(this->element, input.element, size_input_real * sizeof(T)); - } - } else { - T *new_element = (T *)tool::malloc_aligned(size_input_real, sizeof(T), 16, MALLOC_CAP_8BIT); - tool::copy_memory(new_element, input.element, size_input_real * sizeof(T)); - this->element = new_element; - } - return *this; - } else { - if (this->element) { - tool::free_aligned(this->element); - this->element = NULL; - } - return *this; - } - } - - static Tensor arange(int size) - { - Tensor output; - output.set_auto_free(true).set_exponent(0).set_shape({size}).malloc_element(); - for (int i = 0; i < size; ++i) { - output.element[i] = i; - } - return output; - } - - /** - * @brief Copy element from input. - * If the dtype of input is different from this tensor, convert the element of input to the dtype of this tensor and - * copy them. If this tensor is empty, malloc the element and copy input element. - * - * @param input an input tensor - * @return - * - true: same shape - * - false: not - */ - bool convert_from(TensorBase *input); - bool convert_from(const Tensor &input); - bool convert_from(const Tensor &input); - bool convert_from(const Tensor &input); - - // /** - // * @brief Copy element to dest. - // * If the dtype of dest is different from this tensor, convert the element of this tensor to the dtype of dest - // and copy them. - // * - // * @param dest an input tensor - // * @return - // * - true: same shape - // * - false: not - // */ - // bool convert_to(Tensor &dest); - // bool convert_to(Tensor &dest); - // bool convert_to(Tensor &dest); - - // /** - // * @brief Quantize input: round(clip((in * scale ), quant_min, quant_max)) - // * - // */ - // static float quantize(float input, float scale, float quant_min, float quant_max); - - // /** - // * @brief Dequantize input: input * scale - // */ - // static float dequantize(int input, float scale); -}; -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/src/dl_constant.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/src/dl_constant.cpp deleted file mode 100644 index 83de3b4e..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/src/dl_constant.cpp +++ /dev/null @@ -1,158 +0,0 @@ -#include -#include -#include -#include - -#include "dl_constant.hpp" - -using namespace std; - -namespace dl { -template -Constant::Constant(const T *element, - const int exponent, - const vector shape, - const bool dynamic_alloc, - const dl::memory_relayout_type_t memory_relayout) : - element(element), exponent(exponent), shape(shape), dynamic_alloc(dynamic_alloc) -{ - if (this->dynamic_alloc) { - int size = 1; - int alloc_size = 1; - for (int i = 0; i < shape.size(); ++i) { - assert(shape[i] >= 0); - size *= shape[i]; - } - // Apply for more memory to avoid reading data from illegal addresses when loading data into vector registers. - if ((sizeof(T) == 1 || sizeof(T) == 4) && (size & 15)) { - alloc_size = ((size >> 4) + 1) << 4; - } else if (sizeof(T) == 2 && (size & 7)) { - alloc_size = ((size >> 3) + 1) << 3; - } else { - alloc_size = size; - } - - if (memory_relayout == dl::MEMORY_RELAYOUT_INTERTWINE_32_32 && sizeof(T) == 4) { - this->element = (T *)tool::calloc_aligned(alloc_size, sizeof(T) * 2, 16, MALLOC_CAP_8BIT); - - const int32_t *input_element_tmp = reinterpret_cast(element); - int64_t *element_tmp = reinterpret_cast(const_cast(this->element)); - for (int i = 0; i < size; i++) { - element_tmp[i] = input_element_tmp[i]; - } - } else { - this->element = (T *)tool::calloc_aligned(alloc_size, sizeof(T), 16, MALLOC_CAP_8BIT); - tool::copy_memory(const_cast(this->element), const_cast(element), size * sizeof(T)); - } - } -} -template Constant::Constant(const int16_t *element, - const int exponent, - const vector shape, - const bool dynamic_alloc, - const dl::memory_relayout_type_t memory_relayout); -template Constant::Constant(const int8_t *element, - const int exponent, - const vector shape, - const bool dynamic_alloc, - const dl::memory_relayout_type_t memory_relayout); -template Constant::Constant(const int32_t *element, - const int exponent, - const vector shape, - const bool dynamic_alloc, - const dl::memory_relayout_type_t memory_relayout); - -template -Constant::~Constant() -{ - if (this->dynamic_alloc) { - if (this->element) { - tool::free_aligned(const_cast(this->element)); - } - } -} -template Constant::~Constant(); -template Constant::~Constant(); -template Constant::~Constant(); - -template -Filter::Filter(const T *element, - const int exponent, - const std::vector shape, - const std::vector dilation, - const bool dynamic_alloc) : - Constant(element, exponent, shape, dynamic_alloc), dilation(dilation), channel_exponent_size(1) -{ - this->shape_with_dilation = shape; - for (int i = 0; i < dilation.size(); i++) this->shape_with_dilation[i] = (shape[i] - 1) * dilation[i] + 1; -} -template Filter::Filter(const int16_t *element, - const int exponent, - const std::vector shape, - const std::vector dilation, - const bool dynamic_alloc); -template Filter::Filter(const int8_t *element, - const int exponent, - const std::vector shape, - const std::vector dilation, - const bool dynamic_alloc); - -template -Filter::Filter(const T *element, - const int8_t *channel_exponent, - const int channel_exponent_size, - const std::vector shape, - const std::vector dilation) : - Constant(element, INT_MIN, shape), - dilation(dilation), - channel_exponent(channel_exponent), - channel_exponent_size(channel_exponent_size) -{ - this->shape_with_dilation = shape; - for (int i = 0; i < dilation.size(); i++) this->shape_with_dilation[i] = (shape[i] - 1) * dilation[i] + 1; -} -template Filter::Filter(const int8_t *element, - const int8_t *channel_exponent, - const int channel_exponent_size, - const std::vector shape, - const std::vector dilation); - -template -void Filter::print2d_n(const int n, const char *message) const -{ - printf("%s\n", message); - - for (int y = 0; y < this->shape[0]; y++) { - for (int x = 0; x < this->shape[1]; x++) { - printf("("); - for (size_t c = 0; c < this->shape[2]; c++) { - printf("%7d", this->element[((y * this->shape[1] + x) * this->shape[2] + c) * this->shape[3] + n]); - } - printf(")"); - } - printf("\n"); - } -} -template void Filter::print2d_n(const int n, const char *message) const; -template void Filter::print2d_n(const int n, const char *message) const; - -template -Activation::Activation(const activation_type_t type, - const T *element, - const int exponent, - const vector shape, - const bool dynamic_alloc) : - Constant(element, exponent, shape, dynamic_alloc), type(type) -{ -} -template Activation::Activation(const activation_type_t type, - const int16_t *element, - const int exponent, - const vector shape, - const bool dynamic_alloc); -template Activation::Activation(const activation_type_t type, - const int8_t *element, - const int exponent, - const vector shape, - const bool dynamic_alloc); -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/src/dl_tensor_base.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/src/dl_tensor_base.cpp deleted file mode 100644 index 985b9d9e..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/src/dl_tensor_base.cpp +++ /dev/null @@ -1,544 +0,0 @@ -#include "dl_tensor_base.hpp" - -namespace dl { -int quantize(float input, float scale, float quant_min, float quant_max) -{ - int output = tool::round(input * scale); - output = DL_CLIP(output, quant_min, quant_max); - return output; -} - -float dequantize(int input, float scale) -{ - float output = input * scale; - return output; -} - -size_t dtype_sizeof(dtype_t dtype) -{ - switch (dtype) { - case DATA_TYPE_FLOAT: - return sizeof(float); - case DATA_TYPE_INT8: - return sizeof(int8_t); - case DATA_TYPE_UINT8: - return sizeof(uint8_t); - case DATA_TYPE_INT16: - return sizeof(int16_t); - case DATA_TYPE_UINT16: - return sizeof(uint16_t); - case DATA_TYPE_INT32: - return sizeof(int32_t); - case DATA_TYPE_UINT32: - return sizeof(uint32_t); - case DATA_TYPE_INT64: - return sizeof(int64_t); - case DATA_TYPE_UINT64: - return sizeof(uint64_t); - case DATA_TYPE_BOOL: - return sizeof(bool); - case DATA_TYPE_DOUBLE: - return sizeof(double); - case DATA_TYPE_FLOAT16: - return 2; - default: - return 1; - } - return 1; -} - -const char *dtype_to_string(dtype_t dtype) -{ - switch (dtype) { - case DATA_TYPE_FLOAT: - return "float"; - case DATA_TYPE_UINT8: - return "uint8"; - case DATA_TYPE_INT8: - return "int8"; - case DATA_TYPE_UINT16: - return "uint16"; - case DATA_TYPE_INT16: - return "int16"; - case DATA_TYPE_INT32: - return "int32"; - case DATA_TYPE_UINT32: - return "unit32"; - case DATA_TYPE_DOUBLE: - return "double"; - case DATA_TYPE_STRING: - return "string"; - case DATA_TYPE_BOOL: - return "bool"; - case DATA_TYPE_FLOAT16: - return "float16"; - case DATA_TYPE_INT64: - return "int64"; - case DATA_TYPE_UINT64: - return "uint64"; - case DATA_TYPE_UNDEFINED: - return "undefined"; - default: - return "undefined"; - } - return "undefined"; -} - -const char *activation_type_to_string(activation_type_t type) -{ - switch (type) { - case Linear: - return "None"; - case ReLU: - return "ReLU"; - case LeakyReLU: - return "LeakyReLU"; - case PReLU: - return "PReLU"; - default: - return "None"; - } - return "None"; -} - -const char *quant_type_to_string(quant_type_t type) -{ - switch (type) { - case QUANT_TYPE_SYMM_8BIT: - return "symm 8bit"; - case QUANT_TYPE_SYMM_16BIT: - return "symm 16bit"; - case QUANT_TYPE_SYMM_32BIT: - return "symm 32bit"; - case QUANT_TYPE_FLOAT32: - return "float"; - default: - return "None"; - } - return "None"; -} - -std::string shape_to_string(std::vector shape) -{ - if (shape.size() == 0) { - return "[]"; - } - - std::string str = "["; - for (int i = 0; i < shape.size(); i++) { - str += std::to_string(shape[i]); - if (i != shape.size() - 1) { - str += ", "; - } - } - str += "]"; - return str; -} - -TensorBase::TensorBase( - std::vector shape, const void *element, int exponent, dtype_t dtype, bool deep, uint32_t caps) -{ - this->set_shape(shape); - this->exponent = exponent; - this->dtype = dtype; - this->cache = nullptr; - size_t dtype_bytes = this->get_dtype_bytes(); - size_t aligned_size = this->get_aligned_size(); - if (element) { - if (deep) { - this->auto_free = true; - this->data = tool::calloc_aligned(aligned_size, dtype_bytes, 16, caps); - tool::copy_memory(this->data, const_cast(element), this->get_size() * dtype_bytes); - } else { - this->auto_free = false; - this->data = const_cast(element); - } - } else { - this->auto_free = true; - this->data = tool::calloc_aligned(aligned_size, dtype_bytes, 16, caps); - } - this->caps = caps; -} - -bool TensorBase::assign(TensorBase *tensor) -{ - if (tensor == nullptr || this->get_size() != tensor->get_size()) { - return false; - } - - if (this->exponent == tensor->exponent && this->dtype == tensor->dtype) { - tool::copy_memory(this->data, tensor->data, this->get_bytes()); - } else if (tensor->dtype == DATA_TYPE_FLOAT) { - float *src_data = (float *)tensor->data; - float scale = 1.0 / (DL_SCALE(this->exponent)); - - if (this->dtype == DATA_TYPE_INT8) { - int8_t *data = (int8_t *)this->data; - for (int i = 0; i < this->get_size(); i++) { - data[i] = static_cast(quantize(src_data[i], scale, DL_QUANT8_MIN, DL_QUANT8_MAX)); - } - } else if (this->dtype == DATA_TYPE_INT16) { - int16_t *data = (int16_t *)this->data; - for (int i = 0; i < this->get_size(); i++) { - data[i] = static_cast(quantize(src_data[i], scale, DL_QUANT16_MIN, DL_QUANT16_MAX)); - } - } else { - return false; - } - } else if (this->dtype == DATA_TYPE_FLOAT) { - float *data = (float *)this->data; - float scale = DL_SCALE(tensor->exponent); - - if (tensor->dtype == DATA_TYPE_INT8 || tensor->dtype == DATA_TYPE_UINT8) { - int8_t *src_data = (int8_t *)tensor->data; - for (int i = 0; i < this->get_size(); i++) { - data[i] = dequantize(src_data[i], scale); - } - } else if (tensor->dtype == DATA_TYPE_INT16 || tensor->dtype == DATA_TYPE_UINT16) { - int16_t *src_data = (int16_t *)tensor->data; - for (int i = 0; i < this->get_size(); i++) { - data[i] = dequantize(src_data[i], scale); - } - } else { - return false; - } - } else if (this->exponent != tensor->exponent || this->dtype != tensor->dtype) { - // quantize(dequtize()) - if (this->exponent == tensor->exponent) { - if (this->dtype == DATA_TYPE_INT8 && tensor->dtype == DATA_TYPE_INT16) { - int16_t *src_data = static_cast(tensor->data); - int8_t *data = static_cast(this->data); - for (int i = 0; i < this->get_size(); i++) { - data[i] = static_cast(DL_CLIP(src_data[i], DL_QUANT8_MIN, DL_QUANT8_MAX)); - } - } else if (this->dtype == DATA_TYPE_INT16 && tensor->dtype == DATA_TYPE_INT8) { - int8_t *src_data = static_cast(tensor->data); - int16_t *data = static_cast(this->data); - for (int i = 0; i < this->get_size(); i++) { - data[i] = static_cast(src_data[i]); - } - } else { - return false; - } - } else { - float src_scale = DL_SCALE(tensor->exponent); - float scale = 1.0 / (DL_SCALE(this->exponent)); - - if (this->dtype == DATA_TYPE_INT8 && tensor->dtype == DATA_TYPE_INT8) { - int8_t *src_data = static_cast(tensor->data); - int8_t *data = static_cast(this->data); - for (int i = 0; i < this->get_size(); i++) { - float tmp = dequantize(src_data[i], src_scale); - data[i] = static_cast(quantize(tmp, scale, DL_QUANT8_MIN, DL_QUANT8_MAX)); - } - } else if (this->dtype == DATA_TYPE_INT16 && tensor->dtype == DATA_TYPE_INT16) { - int16_t *src_data = static_cast(tensor->data); - int16_t *data = static_cast(this->data); - for (int i = 0; i < this->get_size(); i++) { - float tmp = dequantize(src_data[i], src_scale); - data[i] = static_cast(quantize(tmp, scale, DL_QUANT16_MIN, DL_QUANT16_MAX)); - } - } else if (this->dtype == DATA_TYPE_INT8 && tensor->dtype == DATA_TYPE_INT16) { - int16_t *src_data = static_cast(tensor->data); - int8_t *data = static_cast(this->data); - for (int i = 0; i < this->get_size(); i++) { - float tmp = dequantize(src_data[i], src_scale); - data[i] = static_cast(quantize(tmp, scale, DL_QUANT8_MIN, DL_QUANT8_MAX)); - } - } else if (this->dtype == DATA_TYPE_INT16 && tensor->dtype == DATA_TYPE_INT8) { - int8_t *src_data = static_cast(tensor->data); - int16_t *data = static_cast(this->data); - for (int i = 0; i < this->get_size(); i++) { - float tmp = dequantize(src_data[i], src_scale); - data[i] = static_cast(quantize(tmp, scale, DL_QUANT16_MIN, DL_QUANT16_MAX)); - } - } else { - return false; - } - } - } else { - return false; - } - return true; -} - -bool TensorBase::assign(std::vector shape, const void *element, int exponent, dtype_t dtype) -{ - TensorBase tensor(shape, element, exponent, dtype, false); - return this->assign(&tensor); -} - -std::vector TensorBase::get_axis_index(int element_index) -{ - std::vector axis_index(this->shape.size(), 0); - for (int j = this->shape.size() - 1; j > -1; --j) { - axis_index[j] = element_index % this->shape[j]; - element_index /= this->shape[j]; - } - return axis_index; -} - -TensorBase &TensorBase::set_shape(const std::vector shape) -{ - assert(shape.size() > 0); - this->size = 1; - for (int i = 0; i < shape.size(); ++i) { - assert(shape[i] >= 1); - this->size *= shape[i]; - } - this->shape = shape; - - std::vector axis_offset(this->shape.size(), 1); - for (int i = shape.size() - 2; i > -1; --i) { - axis_offset[i] = axis_offset[i + 1] * this->shape[i + 1]; - } - this->axis_offset = axis_offset; - return *this; -} - -size_t TensorBase::set_preload_addr(void *addr, size_t size) -{ - size_t aligned_size = this->get_aligned_size(); - if (addr && size >= aligned_size) { - this->cache = addr; - return aligned_size; - } - this->cache = nullptr; - return 0; -} - -void TensorBase::reset_bias_layout(quant_type_t op_quant_type, bool is_depthwise) -{ - // The bias needs to be quantized to 32 bits. - assert(this->dtype == DATA_TYPE_INT32); - -#if CONFIG_IDF_TARGET_ESP32P4 - // Reset bias layout for esp32p4 - if (op_quant_type == QUANT_TYPE_SYMM_16BIT) { - this->dtype = DATA_TYPE_INT64; - size_t dtype_bytes = this->get_dtype_bytes(); - size_t aligned_size = this->get_aligned_size(); - - int32_t *pre_data = static_cast(this->data); - int64_t *cur_data = static_cast(tool::calloc_aligned(aligned_size, dtype_bytes, 16, this->caps)); - for (int i = 0; i < this->get_size(); i++) { - cur_data[i] = pre_data[i]; - } - heap_caps_free(this->data); - this->data = cur_data; - } -#elif CONFIG_IDF_TARGET_ESP32S3 - // Reset bias layout for esp32s3 - // 0x000AAAAA000BBBBB ==> 0xAAAAABBBBB - if (op_quant_type == QUANT_TYPE_SYMM_8BIT) { - size_t dtype_bytes = 1; - size_t align = 16 / dtype_bytes; - size_t data_num = this->get_size(); - size_t align_num = ((size_t)(data_num / align)) * align; - size_t remain_num = data_num - align_num; - if (is_depthwise) { - align_num = data_num; - remain_num = 0; - } - // QACC, EE.LD.QACC_L.L.128.IP / EE.LD.QACC_H.L.128.IP requires 16-byte address alignment. - // When the bias is stored with a size of 4 bytes, the address is exactly 16-byte aligned - // when used in EE.LD.QACC_H.L.128.IP, so the size of the aligned portion of memory here - // is calculated based on 4 bytes. - // ACCX, EE.LD.ACCX.IP requires 8-byte address alignment. - size_t memory_size_needed = align_num * 4 + remain_num * 8; - // get the aligned size - memory_size_needed = memory_size_needed % align == 0 ? memory_size_needed - : memory_size_needed + align - memory_size_needed % align; - int32_t *src_ptr = static_cast(this->data); - int8_t *dst_ptr = static_cast(tool::calloc_aligned(memory_size_needed, dtype_bytes, 16, this->caps)); - int8_t *dst_ptr_head = dst_ptr; - - // 0x000AAAAA000BBBBB ==> 0xAAAAABBBBB - int i = 0; - for (; i < align_num; i++) { - int32_t src_data = src_ptr[i] & 0xfffff; - if (i & 1) { - int8_t src_least_4bit = src_data & 0xf; - (*(--dst_ptr_head)) |= (src_least_4bit << 4); - src_data >>= 4; - } else { - *dst_ptr_head = src_data & 0xff; - src_data >>= 8; - } - dst_ptr_head++; - *(reinterpret_cast(dst_ptr_head)) = static_cast(src_data); - dst_ptr_head += 2; - - // Move to the 16-byte memory address alignment. - if (((i + 1) % (align >> 1) == 0) && (reinterpret_cast(dst_ptr_head) & 0xf)) { - dst_ptr_head = dst_ptr_head + 16 - (reinterpret_cast(dst_ptr_head) & 0xf); - } - } - - for (int j = 0; j < remain_num; j++, i++) { - (reinterpret_cast(dst_ptr_head))[j] = src_ptr[i]; - } - - heap_caps_free(this->data); - this->data = dst_ptr; - } else if (op_quant_type == QUANT_TYPE_SYMM_16BIT) { - // TODO: reset bias layout for esp32s3 s16 - } -#endif -} - -TensorBase &TensorBase::reshape(std::vector shape) -{ - int size_gt = this->get_size(); - int index = -1; - for (int i = 0; i < shape.size(); ++i) { - if (shape[i] == -1) { - assert(index == -1); - index = i; - } else { - assert(shape[i] > 0); - } - } - int size = 1; - if (index == -1) { - for (int i = 0; i < shape.size(); ++i) { - size *= shape[i]; - } - assert(size == size_gt); - this->set_shape(shape); - } else { - for (int i = 0; i < shape.size(); ++i) { - if (shape[i] > 0) { - size *= shape[i]; - } - } - assert((size_gt % size) == 0); - shape[index] = size_gt / size; - this->set_shape(shape); - } - return *this; -} - -template -TensorBase *TensorBase::transpose(T *input_element, - std::vector &input_shape, - std::vector &input_axis_offset, - std::vector &perm) -{ - if (perm.size() == 0) { - for (int i = shape.size() - 1; i >= 0; i--) { - perm.push_back(i); - } - } - int dims = perm.size(); - - for (int i = 0; i < dims; ++i) { - if (perm[i] < 0) - perm[i] = dims + perm[i]; - this->shape[i] = input_shape[perm[i]]; - } - - this->axis_offset[dims - 1] = 1; - for (int i = dims - 2; i > -1; --i) { - this->axis_offset[i] = this->axis_offset[i + 1] * this->shape[i + 1]; - } - T *output_element = (T *)this->get_element_ptr(); - - std::vector input_axis_index(dims); - if (dims == 4) { - uint32_t input_idx = 0, output_idx = 0; - for (int i = 0; i < input_shape[0]; i++) { - for (int j = 0; j < input_shape[1]; j++) { - for (int k = 0; k < input_shape[2]; k++) { - for (int l = 0; l < input_shape[3]; l++) { - input_axis_index = {i, j, k, l}; - input_idx = l + k * input_axis_offset[2] + j * input_axis_offset[1] + i * input_axis_offset[0]; - output_idx = input_axis_index[perm[3]] * this->axis_offset[3] + - input_axis_index[perm[2]] * this->axis_offset[2] + - input_axis_index[perm[1]] * this->axis_offset[1] + - input_axis_index[perm[0]] * this->axis_offset[0]; - output_element[output_idx] = input_element[input_idx]; - } - } - } - } - } else if (dims == 3) { - uint32_t input_idx = 0, output_idx = 0; - for (int i = 0; i < input_shape[0]; i++) { - for (int j = 0; j < input_shape[1]; j++) { - for (int k = 0; k < input_shape[2]; k++) { - input_axis_index = {i, j, k}; - input_idx = k + j * input_axis_offset[1] + i * input_axis_offset[0]; - output_idx = input_axis_index[perm[2]] * this->axis_offset[2] + - input_axis_index[perm[1]] * this->axis_offset[1] + - input_axis_index[perm[0]] * this->axis_offset[0]; - output_element[output_idx] = input_element[input_idx]; - } - } - } - } else if (dims == 2) { - uint32_t input_idx = 0, output_idx = 0; - for (int i = 0; i < input_shape[0]; i++) { - for (int j = 0; j < input_shape[1]; j++) { - input_axis_index = {i, j}; - input_idx = j + i * input_axis_offset[0]; - output_idx = - input_axis_index[perm[1]] * this->axis_offset[1] + input_axis_index[perm[0]] * this->axis_offset[0]; - output_element[output_idx] = input_element[input_idx]; - } - } - } else { - // for any dims - std::vector index_old(dims, 0); - for (int i = 0; i < size; ++i) { - int dim_div_value = i; - int index_new = 0; - for (int j = dims - 1; j > -1; --j) { - index_old[j] = dim_div_value % input_shape[j]; - dim_div_value /= input_shape[j]; - } - for (int j = dims - 1; j > -1; --j) { - index_new += index_old[perm[j]] * this->axis_offset[j]; - } - output_element[index_new] = input_element[i]; - } - } - - return this; -} - -TensorBase *TensorBase::transpose(TensorBase *input, std::vector perm) -{ - assert(this->get_size() == input->get_size()); - assert(this->dtype == input->dtype); - - if (this->dtype == DATA_TYPE_INT8) { - transpose((int8_t *)input->get_element_ptr(), input->shape, input->axis_offset, perm); - } else if (this->dtype == DATA_TYPE_UINT8) { - transpose((uint8_t *)input->get_element_ptr(), input->shape, input->axis_offset, perm); - } else if (this->dtype == DATA_TYPE_INT16) { - transpose((int16_t *)input->get_element_ptr(), input->shape, input->axis_offset, perm); - } else if (this->dtype == DATA_TYPE_INT32) { - transpose((int32_t *)input->get_element_ptr(), input->shape, input->axis_offset, perm); - } else if (this->dtype == DATA_TYPE_UINT16) { - transpose((uint16_t *)input->get_element_ptr(), input->shape, input->axis_offset, perm); - } else if (this->dtype == DATA_TYPE_INT32) { - transpose((uint32_t *)input->get_element_ptr(), input->shape, input->axis_offset, perm); - } else if (this->dtype == DATA_TYPE_FLOAT) { - transpose((float *)input->get_element_ptr(), input->shape, input->axis_offset, perm); - } - - return this; -} - -int TensorBase::get_element_index(const std::vector axis_index) -{ - assert(axis_index.size() == this->shape.size()); - int element_index = 0; - for (int i = 0; i < axis_index.size(); i++) { - element_index += axis_index[i] * this->axis_offset[i]; - } - return element_index; -} - -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/src/dl_variable.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/src/dl_variable.cpp deleted file mode 100644 index a03db4c2..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/dl/typedef/src/dl_variable.cpp +++ /dev/null @@ -1,982 +0,0 @@ -#include "dl_variable.hpp" -#include -#include -#include -#include - -using namespace std; -using namespace dl; - -namespace dl { - -template -Tensor &Tensor::set_shape(const vector shape) -{ - assert(shape.size() > 0); - this->size = 1; - for (int i = 0; i < shape.size(); ++i) { - assert(shape[i] >= 0); - this->size *= shape[i]; - } - this->shape = shape; - - std::vector axis_offset(this->shape.size(), 1); - for (int i = shape.size() - 2; i > -1; --i) { - axis_offset[i] = axis_offset[i + 1] * this->shape[i + 1]; - } - this->axis_offset = axis_offset; - return *this; -} -template Tensor &Tensor::set_shape(const vector shape); -template Tensor &Tensor::set_shape(const vector shape); -template Tensor &Tensor::set_shape(const vector shape); -template Tensor &Tensor::set_shape(const vector shape); -template Tensor &Tensor::set_shape(const vector shape); -template Tensor &Tensor::set_shape(const vector shape); - -template -Tensor &Tensor::flatten() -{ - this->set_shape({this->get_size()}); - return *this; -} -template Tensor &Tensor::flatten(); -template Tensor &Tensor::flatten(); -template Tensor &Tensor::flatten(); -template Tensor &Tensor::flatten(); -template Tensor &Tensor::flatten(); -template Tensor &Tensor::flatten(); - -template -Tensor &Tensor::reshape(vector shape) -{ - int size_gt = this->get_size(); - int index = -1; - for (int i = 0; i < shape.size(); ++i) { - if (shape[i] == -1) { - assert(index == -1); - index = i; - } else { - assert(shape[i] > 0); - } - } - int size = 1; - if (index == -1) { - for (int i = 0; i < shape.size(); ++i) { - size *= shape[i]; - } - assert(size == size_gt); - this->set_shape(shape); - } else { - for (int i = 0; i < shape.size(); ++i) { - if (shape[i] > 0) { - size *= shape[i]; - } - } - assert((size_gt % size) == 0); - shape[index] = size_gt / size; - this->set_shape(shape); - } - return *this; -} -template Tensor &Tensor::reshape(vector shape); -template Tensor &Tensor::reshape(vector shape); -template Tensor &Tensor::reshape(vector shape); -template Tensor &Tensor::reshape(vector shape); -template Tensor &Tensor::reshape(vector shape); -template Tensor &Tensor::reshape(vector shape); - -template -Tensor &Tensor::squeeze(int axis) -{ - vector new_shape = this->shape; - if (axis == INT32_MAX) { - auto iter = std::remove(new_shape.begin(), new_shape.end(), 1); - new_shape.erase(iter, new_shape.end()); - } else { - if (axis < 0) - axis = new_shape.size() + axis; - assert(axis >= 0); - assert(*(new_shape.begin() + axis) == 1); - new_shape.erase(new_shape.begin() + axis); - } - this->set_shape(new_shape); - return *this; -} -template Tensor &Tensor::squeeze(int axis); -template Tensor &Tensor::squeeze(int axis); -template Tensor &Tensor::squeeze(int axis); -template Tensor &Tensor::squeeze(int axis); -template Tensor &Tensor::squeeze(int axis); -template Tensor &Tensor::squeeze(int axis); - -template -Tensor &Tensor::expand_dims(int axis) -{ - if (axis < 0) - axis = this->shape.size() + axis; - assert(axis >= 0); - vector new_shape = this->shape; - new_shape.insert(new_shape.begin() + axis, 1); - this->set_shape(new_shape); - return *this; -} -template Tensor &Tensor::expand_dims(int axis); -template Tensor &Tensor::expand_dims(int axis); -template Tensor &Tensor::expand_dims(int axis); -template Tensor &Tensor::expand_dims(int axis); -template Tensor &Tensor::expand_dims(int axis); -template Tensor &Tensor::expand_dims(int axis); - -template -Tensor &Tensor::expand_dims(vector axis) -{ - assert(axis.size() > 0); - int size_new = axis.size() + this->shape.size(); - for (int i = 0; i < axis.size(); i++) { - if (axis[i] < 0) { - axis[i] = size_new + axis[i]; - } - assert((axis[i] >= 0) && (axis[i] < size_new)); - } - sort(axis.begin(), axis.end()); - - for (int i = 1; i < axis.size(); i++) { - assert(axis[i] != axis[i - 1]); - } - - vector new_shape(size_new, 1); - int axis_index = 0; - int shape_index = 0; - for (int i = 0; i < size_new; i++) { - if (axis[axis_index] > i) { - new_shape[i] = this->shape[shape_index++]; - } else { - axis_index++; - } - } - this->set_shape(new_shape); - return *this; -} -template Tensor &Tensor::expand_dims(vector axis); -template Tensor &Tensor::expand_dims(vector axis); -template Tensor &Tensor::expand_dims(vector axis); -template Tensor &Tensor::expand_dims(vector axis); -template Tensor &Tensor::expand_dims(vector axis); -template Tensor &Tensor::expand_dims(vector axis); - -template -Tensor &Tensor::transpose(vector perm) -{ - if (perm.size() == 0) { - for (int i = this->shape.size() - 1; i >= 0; i--) { - perm.push_back(i); - } - } - assert(perm.size() == this->shape.size()); - assert(this->element != NULL); - int dims = perm.size(); - int size = this->get_size(); - Tensor temp(*this, true); - temp.set_auto_free(true); - - vector index_old(dims, 0); - vector new_shape(dims, 0); - - for (int i = 0; i < dims; ++i) { - if (perm[i] < 0) - perm[i] = dims + perm[i]; - assert((perm[i] >= 0) && (perm[i] < dims)); - new_shape[i] = this->shape[perm[i]]; - } - - this->axis_offset[dims - 1] = 1; - - for (int i = dims - 2; i > -1; --i) { - this->axis_offset[i] = this->axis_offset[i + 1] * new_shape[i + 1]; - } - - for (int i = 0; i < size; ++i) { - int dim_div_value = i; - int index_new = 0; - for (int j = dims - 1; j > -1; --j) { - index_old[j] = dim_div_value % this->shape[j]; - dim_div_value /= this->shape[j]; - } - - for (int j = dims - 1; j > -1; --j) { - index_new += index_old[perm[j]] * this->axis_offset[j]; - } - this->element[index_new] = temp.element[i]; - } - this->shape = new_shape; - - return *this; -} -template Tensor &Tensor::transpose(vector perm); -template Tensor &Tensor::transpose(vector perm); -template Tensor &Tensor::transpose(vector perm); -template Tensor &Tensor::transpose(vector perm); -template Tensor &Tensor::transpose(vector perm); -template Tensor &Tensor::transpose(vector perm); - -template -Tensor &Tensor::transpose(Tensor &input, vector perm) -{ - if (perm.size() == 0) { - for (int i = this->shape.size() - 1; i >= 0; i--) { - perm.push_back(i); - } - } - assert(perm.size() == input.shape.size()); - assert(this->get_size() == input.get_size()); - this->malloc_element(); - int dims = perm.size(); - int size = input.get_size(); - - vector index_old(dims, 0); - vector new_shape(dims, 0); - - for (int i = 0; i < dims; ++i) { - if (perm[i] < 0) - perm[i] = dims + perm[i]; - new_shape[i] = input.shape[perm[i]]; - } - - this->axis_offset[dims - 1] = 1; - - for (int i = dims - 2; i > -1; --i) { - this->axis_offset[i] = this->axis_offset[i + 1] * new_shape[i + 1]; - } - - for (int i = 0; i < size; ++i) { - int dim_div_value = i; - int index_new = 0; - for (int j = dims - 1; j > -1; --j) { - index_old[j] = dim_div_value % input.shape[j]; - dim_div_value /= input.shape[j]; - } - for (int j = dims - 1; j > -1; --j) { - index_new += index_old[perm[j]] * this->axis_offset[j]; - } - this->element[index_new] = input.element[i]; - } - this->shape = new_shape; - - return *this; -} -template Tensor &Tensor::transpose(Tensor &input, vector perm); -template Tensor &Tensor::transpose(Tensor &input, vector perm); -template Tensor &Tensor::transpose(Tensor &input, vector perm); -template Tensor &Tensor::transpose(Tensor &input, vector perm); -template Tensor &Tensor::transpose(Tensor &input, vector perm); -template Tensor &Tensor::transpose(Tensor &input, vector perm); - -template -void Tensor::print(std::vector axis_index_range, const char *message) -{ - if (axis_index_range.size() == 0) { - for (int i = 0; i < this->shape.size(); i++) { - axis_index_range.push_back(0); - axis_index_range.push_back(this->shape[i]); - } - } - assert(axis_index_range.size() == (2 * this->shape.size())); - std::vector axis_index(this->shape.size(), 0); - std::vector max_axis_index(this->shape.size(), 0); - std::vector min_axis_index(this->shape.size(), 0); - - std::cout << message << " ["; - int last_i = this->shape.size() - 1; - for (int i = 0; i < last_i; i++) { - axis_index_range[2 * i] = - (axis_index_range[2 * i] < 0) ? (this->shape[i] + axis_index_range[2 * i]) : axis_index_range[2 * i]; - axis_index_range[2 * i + 1] = (axis_index_range[2 * i + 1] < 0) ? (this->shape[i] + axis_index_range[2 * i + 1]) - : axis_index_range[2 * i + 1]; - if (axis_index_range[2 * i + 1] > this->shape[i]) { - axis_index_range[2 * i + 1] = this->shape[i]; - } - assert(axis_index_range[2 * i + 1] > axis_index_range[2 * i]); - axis_index[i] = axis_index_range[2 * i]; - min_axis_index[i] = axis_index_range[2 * i]; - max_axis_index[i] = axis_index_range[2 * i + 1] - 1; - std::cout << axis_index_range[2 * i] << ":" << axis_index_range[2 * i + 1] << ", "; - } - axis_index_range[2 * last_i] = (axis_index_range[2 * last_i] < 0) - ? (this->shape[last_i] + axis_index_range[2 * last_i]) - : axis_index_range[2 * last_i]; - axis_index_range[2 * last_i + 1] = (axis_index_range[2 * last_i + 1] < 0) - ? (this->shape[last_i] + axis_index_range[2 * last_i + 1]) - : axis_index_range[2 * last_i + 1]; - if (axis_index_range[2 * last_i + 1] > this->shape[last_i]) { - axis_index_range[2 * last_i + 1] = this->shape[last_i]; - } - assert(axis_index_range[2 * last_i + 1] > axis_index_range[2 * last_i]); - axis_index[last_i] = axis_index_range[2 * last_i]; - min_axis_index[last_i] = axis_index_range[2 * last_i]; - max_axis_index[last_i] = axis_index_range[2 * last_i + 1] - 1; - std::cout << axis_index_range[2 * last_i] << ":" << axis_index_range[2 * last_i + 1] << "] | "; - std::cout << "exponent:" << this->exponent << " | " << "dtype:" << this->get_dtype_string() << " | "; - this->print_shape(); - - int end_axis_num = this->shape.size(); - - while (1) { - for (int i = 0; i < (this->shape.size() - end_axis_num); i++) { - std::cout << " "; - } - for (int i = 0; i < end_axis_num; i++) { - std::cout << "["; - } - while (axis_index[last_i] < max_axis_index[last_i]) { - std::cout << +this->get_element_value(axis_index) << " "; - axis_index[last_i] += 1; - } - std::cout << +this->get_element_value(axis_index); - axis_index[last_i] += 1; - - end_axis_num = 0; - for (int i = last_i; i > 0; i--) { - if (axis_index[i] > max_axis_index[i]) { - axis_index[i] = min_axis_index[i]; - axis_index[i - 1] += 1; - end_axis_num += 1; - std::cout << "]"; - } - } - if (axis_index[0] > max_axis_index[0]) { - std::cout << "]\n"; - break; - } else { - for (int i = 0; i < end_axis_num; i++) { - std::cout << "\n"; - } - } - } -} -template void Tensor::print(std::vector axis_index_range = {}, const char *message = ""); -template void Tensor::print(std::vector axis_index_range = {}, const char *message = ""); -template void Tensor::print(std::vector axis_index_range = {}, const char *message = ""); -template void Tensor::print(std::vector axis_index_range = {}, const char *message = ""); -template void Tensor::print(std::vector axis_index_range = {}, const char *message = ""); -template void Tensor::print(std::vector axis_index_range = {}, const char *message = ""); - -template -int Tensor::get_element_index(const std::vector axis_index) -{ - assert(axis_index.size() == this->shape.size()); - int element_index = 0; - for (int i = 0; i < axis_index.size(); i++) { - element_index += axis_index[i] * this->axis_offset[i]; - } - return element_index; -} -template int Tensor::get_element_index(const std::vector axis_index); -template int Tensor::get_element_index(const std::vector axis_index); -template int Tensor::get_element_index(const std::vector axis_index); -template int Tensor::get_element_index(const std::vector axis_index); -template int Tensor::get_element_index(const std::vector axis_index); -template int Tensor::get_element_index(const std::vector axis_index); - -template -Tensor &Tensor::set_value(T value) -{ - assert(this->element != NULL); - dl::tool::set_value(this->element, value, this->size); - return *this; -} -template Tensor &Tensor::set_value(uint16_t value); -template Tensor &Tensor::set_value(uint8_t value); -template Tensor &Tensor::set_value(int32_t value); -template Tensor &Tensor::set_value(int16_t value); -template Tensor &Tensor::set_value(int8_t value); -template Tensor &Tensor::set_value(float value); - -template -Tensor &Tensor::set_value(Tensor &value) -{ - assert(this->element != NULL); - int dims = this->shape.size(); - assert(value.shape.size() == dims); - for (int i = 0; i < dims; ++i) { - assert((value.shape[i] == this->shape[i]) || (value.shape[i] == 1)); - } - if (this->is_same_shape(value)) // just copy - { - tool::copy_memory(this->element, value.element, this->get_size() * sizeof(T)); - } else // copy with broadcast - { - int min_offset = 0; - int min_offset_axis = value.axis_offset[0]; - for (int i = dims - 1; i >= 0; --i) { - if (value.shape[i] != this->shape[i]) { - min_offset = value.axis_offset[i]; - min_offset_axis = i; - break; - } - } - int min_offset_bytes = min_offset * sizeof(T); - - std::vector axis_index(dims, 0); - std::vector value_index(dims, 0); - T *value_ptr = NULL; - T *output_ptr = this->element; - while (axis_index[0] < this->shape[0]) { - value_ptr = value.element + value.get_element_index(value_index); - tool::copy_memory(output_ptr, value_ptr, min_offset_bytes); - output_ptr += min_offset; - axis_index[min_offset_axis] += 1; - for (int i = min_offset_axis; i > 0; --i) { - if (axis_index[i] == this->shape[i]) { - axis_index[i] = 0; - axis_index[i - 1] += 1; - value_index[i] = 0; - if (value.shape[i - 1] > 1) { - value_index[i - 1] += 1; - } - } else - break; - } - } - } - return *this; -} -template Tensor &Tensor::set_value(Tensor &value); -template Tensor &Tensor::set_value(Tensor &value); -template Tensor &Tensor::set_value(Tensor &value); -template Tensor &Tensor::set_value(Tensor &value); -template Tensor &Tensor::set_value(Tensor &value); -template Tensor &Tensor::set_value(Tensor &value); - -template -Tensor &Tensor::set_value(std::vector axis_index_range, T value) -{ - assert(this->element != NULL); - int dims = this->shape.size(); - assert(axis_index_range.size() == (2 * dims)); - std::vector loop_index_lower_bound(dims, 0); - std::vector loop_index_upper_bound(dims, 0); - - for (int i = 0; i < dims; ++i) { - loop_index_lower_bound[i] = - axis_index_range[2 * i] < 0 ? this->shape[i] + axis_index_range[2 * i] : axis_index_range[2 * i]; - loop_index_lower_bound[i] = loop_index_lower_bound[i] < 0 ? 0 : loop_index_lower_bound[i]; - loop_index_upper_bound[i] = axis_index_range[2 * i + 1] < 0 ? this->shape[i] + axis_index_range[2 * i + 1] - : axis_index_range[2 * i + 1]; - loop_index_upper_bound[i] = - loop_index_upper_bound[i] > this->shape[i] ? this->shape[i] : loop_index_upper_bound[i]; - if (loop_index_lower_bound[i] == loop_index_upper_bound[i]) - return *this; - assert(loop_index_lower_bound[i] < loop_index_upper_bound[i]); - } - std::vector loop_index = loop_index_lower_bound; - T *slice_ptr = NULL; - int min_offset = loop_index_upper_bound[dims - 1] - loop_index_lower_bound[dims - 1]; - - if (dims == 1) { - slice_ptr = this->element + this->get_element_index(loop_index); - tool::set_value(slice_ptr, value, min_offset); - return *this; - } - - while (loop_index[0] < loop_index_upper_bound[0]) { - slice_ptr = this->element + this->get_element_index(loop_index); - tool::set_value(slice_ptr, value, min_offset); - - loop_index[dims - 2] += 1; - for (int i = dims - 2; i > 0; --i) { - if (loop_index[i] == loop_index_upper_bound[i]) { - loop_index[i] = loop_index_lower_bound[i]; - loop_index[i - 1] += 1; - } else - break; - } - } - return *this; -} -template Tensor &Tensor::set_value(std::vector axis_index_range, uint16_t value); -template Tensor &Tensor::set_value(std::vector axis_index_range, uint8_t value); -template Tensor &Tensor::set_value(std::vector axis_index_range, int32_t value); -template Tensor &Tensor::set_value(std::vector axis_index_range, int16_t value); -template Tensor &Tensor::set_value(std::vector axis_index_range, int8_t value); -template Tensor &Tensor::set_value(std::vector axis_index_range, float value); - -template -Tensor &Tensor::set_value(std::vector axis_index_range, Tensor &value) -{ - assert(this->element != NULL); - int dims = this->shape.size(); - assert(axis_index_range.size() == (2 * dims)); - assert(value.shape.size() == dims); - std::vector output_shape(dims, 0); - std::vector loop_index_lower_bound(dims, 0); - std::vector loop_index_upper_bound(dims, 0); - - for (int i = 0; i < dims; ++i) { - loop_index_lower_bound[i] = - axis_index_range[2 * i] < 0 ? this->shape[i] + axis_index_range[2 * i] : axis_index_range[2 * i]; - loop_index_lower_bound[i] = loop_index_lower_bound[i] < 0 ? 0 : loop_index_lower_bound[i]; - loop_index_upper_bound[i] = axis_index_range[2 * i + 1] < 0 ? this->shape[i] + axis_index_range[2 * i + 1] - : axis_index_range[2 * i + 1]; - loop_index_upper_bound[i] = - loop_index_upper_bound[i] > this->shape[i] ? this->shape[i] : loop_index_upper_bound[i]; - if (loop_index_lower_bound[i] == loop_index_upper_bound[i]) - return *this; - assert(loop_index_lower_bound[i] < loop_index_upper_bound[i]); - output_shape[i] = loop_index_upper_bound[i] - loop_index_lower_bound[i]; - assert((value.shape[i] == output_shape[i]) || (value.shape[i] == 1)); - } - std::vector loop_index = loop_index_lower_bound; - T *slice_ptr = NULL; - T *value_ptr = value.element; - - if (value.shape == output_shape) // just copy - { - int min_offset = loop_index_upper_bound[dims - 1] - loop_index_lower_bound[dims - 1]; - int min_offset_axis = dims - 1; - for (int i = dims - 1; i >= 0; --i) { - if (this->shape[i] == output_shape[i]) { - min_offset = this->axis_offset[i] * this->shape[i]; - min_offset_axis = i; - } else - break; - } - int min_offset_bytes = min_offset * sizeof(T); - - if (min_offset_axis == 0) { - slice_ptr = this->element + this->get_element_index(loop_index); - tool::copy_memory(slice_ptr, value_ptr, min_offset_bytes); - return *this; - } - - while (loop_index[0] < loop_index_upper_bound[0]) { - slice_ptr = this->element + this->get_element_index(loop_index); - tool::copy_memory(slice_ptr, value_ptr, min_offset_bytes); - value_ptr += min_offset; - - loop_index[min_offset_axis - 1] += 1; - for (int i = min_offset_axis - 1; i > 0; --i) { - if (loop_index[i] == loop_index_upper_bound[i]) { - loop_index[i] = loop_index_lower_bound[i]; - loop_index[i - 1] += 1; - } else - break; - } - } - return *this; - } else // copy with broadcast - { - std::vector value_index(dims, 0); - - int min_offset = 0; - int min_offset_axis = value.axis_offset[0]; - bool broadcast_axis_flag = false; - for (int i = dims - 1; i >= 0; --i) { - if ((value.shape[i] != output_shape[i]) || (this->shape[i] != output_shape[i])) { - min_offset_axis = i; - min_offset = value.axis_offset[i]; - if (value.shape[i] != output_shape[i]) - broadcast_axis_flag = true; - break; - } - } - int min_offset_bytes = min_offset * sizeof(T); - - if (broadcast_axis_flag) { - while (loop_index[0] < loop_index_upper_bound[0]) { - slice_ptr = this->element + this->get_element_index(loop_index); - value_ptr = value.element + value.get_element_index(value_index); - tool::copy_memory(slice_ptr, value_ptr, min_offset_bytes); - loop_index[min_offset_axis] += 1; - for (int i = min_offset_axis; i > 0; --i) { - if (loop_index[i] == loop_index_upper_bound[i]) { - loop_index[i] = loop_index_lower_bound[i]; - loop_index[i - 1] += 1; - value_index[i] = 0; - if (value.shape[i - 1] > 1) { - value_index[i - 1] += 1; - } - } else - break; - } - } - return *this; - } else { - while (loop_index[0] < loop_index_upper_bound[0]) { - slice_ptr = this->element + this->get_element_index(loop_index); - value_ptr = value.element + value.get_element_index(value_index); - tool::copy_memory(slice_ptr, value_ptr, min_offset_bytes); - loop_index[min_offset_axis] += 1; - value_index[min_offset_axis] += 1; - for (int i = min_offset_axis; i > 0; --i) { - if (loop_index[i] == loop_index_upper_bound[i]) { - loop_index[i] = loop_index_lower_bound[i]; - loop_index[i - 1] += 1; - value_index[i] = 0; - if (value.shape[i - 1] > 1) { - value_index[i - 1] += 1; - } - } else - break; - } - } - return *this; - } - } -} -template Tensor &Tensor::set_value(std::vector axis_index_range, Tensor &value); -template Tensor &Tensor::set_value(std::vector axis_index_range, Tensor &value); -template Tensor &Tensor::set_value(std::vector axis_index_range, Tensor &value); -template Tensor &Tensor::set_value(std::vector axis_index_range, Tensor &value); -template Tensor &Tensor::set_value(std::vector axis_index_range, Tensor &value); -template Tensor &Tensor::set_value(std::vector axis_index_range, Tensor &value); - -template -Tensor Tensor::slice(std::vector axis_index_range) -{ - assert(this->element != NULL); - int dims = this->shape.size(); - assert(axis_index_range.size() == (2 * dims)); - std::vector output_shape(dims, 0); - std::vector loop_index_lower_bound(dims, 0); - std::vector loop_index_upper_bound(dims, 0); - - for (int i = 0; i < dims; ++i) { - loop_index_lower_bound[i] = - axis_index_range[2 * i] < 0 ? this->shape[i] + axis_index_range[2 * i] : axis_index_range[2 * i]; - loop_index_lower_bound[i] = loop_index_lower_bound[i] < 0 ? 0 : loop_index_lower_bound[i]; - loop_index_upper_bound[i] = axis_index_range[2 * i + 1] < 0 ? this->shape[i] + axis_index_range[2 * i + 1] - : axis_index_range[2 * i + 1]; - loop_index_upper_bound[i] = - loop_index_upper_bound[i] > this->shape[i] ? this->shape[i] : loop_index_upper_bound[i]; - assert(loop_index_lower_bound[i] < loop_index_upper_bound[i]); - output_shape[i] = loop_index_upper_bound[i] - loop_index_lower_bound[i]; - } - std::vector loop_index = loop_index_lower_bound; - - Tensor output; - output.set_shape(output_shape).set_exponent(this->exponent).malloc_element(); - T *output_ptr = output.element; - T *slice_ptr = NULL; - int min_offset = output_shape.back(); - int min_offset_bytes = min_offset * sizeof(T); - - if (dims == 1) { - slice_ptr = this->element + this->get_element_index(loop_index); - tool::copy_memory(output_ptr, slice_ptr, min_offset_bytes); - return output; - } - - while (loop_index[0] < loop_index_upper_bound[0]) { - slice_ptr = this->element + this->get_element_index(loop_index); - tool::copy_memory(output_ptr, slice_ptr, min_offset_bytes); - output_ptr += min_offset; - - loop_index[dims - 2] += 1; - for (int i = dims - 2; i > 0; --i) { - if (loop_index[i] == loop_index_upper_bound[i]) { - loop_index[i] = loop_index_lower_bound[i]; - loop_index[i - 1] += 1; - } else - break; - } - } - return output; -} -template Tensor Tensor::slice(std::vector axis_index_range); -template Tensor Tensor::slice(std::vector axis_index_range); -template Tensor Tensor::slice(std::vector axis_index_range); -template Tensor Tensor::slice(std::vector axis_index_range); -template Tensor Tensor::slice(std::vector axis_index_range); -template Tensor Tensor::slice(std::vector axis_index_range); - -template -Tensor &Tensor::reverse(std::vector axis) -{ - if (axis.size() == 0) { - int loop_num = this->size / 2; - int max_index = this->size - 1; - T temp; - for (int i = 0; i < loop_num; ++i) { - temp = this->element[i]; - this->element[i] = this->element[max_index - i]; - this->element[max_index - i] = temp; - } - return *this; - } - - int dims = this->shape.size(); - int reverse_num = axis.size(); - for (int i = 0; i < reverse_num; ++i) { - if (axis[i] < 0) - axis[i] += dims; - assert(axis[i] < dims); - } - sort(axis.begin(), axis.end()); - vector max_axis_index(reverse_num, 0); - for (int i = 0; i < reverse_num; ++i) { - max_axis_index[i] = this->shape[axis[i]] - 1; - } - - vector loop_index(dims, 0); - int min_offset_axis = axis.back(); - int min_offset = this->axis_offset[min_offset_axis]; - int min_offset_bytes = min_offset * sizeof(T); - Tensor temp(*this, true); - temp.set_auto_free(true); - - T *output_ptr = this->element; - T *temp_ptr = NULL; - while (loop_index[0] < this->shape[0]) { - vector loop_index_tmp = loop_index; - for (int i = 0; i < reverse_num; ++i) { - loop_index_tmp[axis[i]] = max_axis_index[i] - loop_index_tmp[axis[i]]; - } - temp_ptr = temp.element + temp.get_element_index(loop_index_tmp); - tool::copy_memory(output_ptr, temp_ptr, min_offset_bytes); - output_ptr += min_offset; - loop_index[min_offset_axis] += 1; - for (int i = min_offset_axis; i > 0; --i) { - if (loop_index[i] == this->shape[i]) { - loop_index[i] = 0; - loop_index[i - 1] += 1; - } else - break; - } - } - return *this; -} -template Tensor &Tensor::reverse(vector axis); -template Tensor &Tensor::reverse(vector axis); -template Tensor &Tensor::reverse(vector axis); -template Tensor &Tensor::reverse(vector axis); -template Tensor &Tensor::reverse(vector axis); -template Tensor &Tensor::reverse(vector axis); - -template -bool Tensor::convert_from(TensorBase *input) -{ - if (input->size <= 0 || input->shape.empty()) { - return false; - } - if (input->dtype == DATA_TYPE_FLOAT) { - Tensor *tensor = static_cast *>(input); - this->convert_from(*tensor); - } else if (input->dtype == DATA_TYPE_INT8) { - Tensor *tensor = static_cast *>(input); - this->convert_from(*tensor); - } else if (input->dtype == DATA_TYPE_INT16) { - Tensor *tensor = static_cast *>(input); - this->convert_from(*tensor); - } else { - return false; - } - return false; -} -template bool Tensor::convert_from(TensorBase *input); -template bool Tensor::convert_from(TensorBase *input); -template bool Tensor::convert_from(TensorBase *input); - -template -bool Tensor::convert_from(const Tensor &input) -{ - int8_t *in_element = input.element; - T *element = this->element; - if (in_element && element) { - if (this->is_same_shape(input)) { - if (std::is_same::value) { // int8 -> int8 - int exponent = input.exponent - this->exponent; - if (exponent == 0) { - tool::copy_memory(element, in_element, this->get_size() * sizeof(T)); - } else { - // TODO:: more effective implementation - float scale = 1.0 / (DL_SCALE(this->exponent)); - for (int i = 0; i < this->get_size(); i++) { - element[i] = quantize( - dequantize(in_element[i], DL_SCALE(input.exponent)), scale, DL_QUANT8_MIN, DL_QUANT8_MAX); - } - } - } else if (std::is_same::value) { // int8 -> int16 - int exponent = input.exponent - this->exponent; - if (exponent == 0) { - for (int i = 0; i < this->get_size(); i++) { - element[i] = in_element[i]; - } - } else { - // TODO:: more effective implementation - float scale = 1.0 / (DL_SCALE(this->exponent)); - for (int i = 0; i < this->get_size(); i++) { - element[i] = quantize( - dequantize(in_element[i], DL_SCALE(input.exponent)), scale, DL_QUANT16_MIN, DL_QUANT16_MAX); - } - } - } else if (std::is_same::value) { // int8 -> float - for (int i = 0; i < this->get_size(); i++) { - element[i] = dequantize(in_element[i], DL_SCALE(input.exponent)); - } - } else { - return false; - } - } - return false; - } else { - return false; - } - - return true; -} -template bool Tensor::convert_from(const Tensor &input); -template bool Tensor::convert_from(const Tensor &input); -template bool Tensor::convert_from(const Tensor &input); - -template -bool Tensor::convert_from(const Tensor &input) -{ - int16_t *in_element = input.element; - T *element = this->element; - if (in_element && element) { - if (this->is_same_shape(input)) { - if (std::is_same::value) { // int16 -> int8 - // TODO:: more effective implementation - float scale = 1.0 / (DL_SCALE(this->exponent)); - for (int i = 0; i < this->get_size(); i++) { - element[i] = quantize( - dequantize(in_element[i], DL_SCALE(input.exponent)), scale, DL_QUANT8_MIN, DL_QUANT8_MAX); - } - } else if (std::is_same::value) { // int16 -> int16 - int exponent = input.exponent - this->exponent; - if (exponent == 0) { - tool::copy_memory(element, in_element, this->get_size() * sizeof(T)); - } else { - // TODO:: more effective implementation - float scale = 1.0 / (DL_SCALE(this->exponent)); - for (int i = 0; i < this->get_size(); i++) { - element[i] = quantize( - dequantize(in_element[i], DL_SCALE(input.exponent)), scale, DL_QUANT16_MIN, DL_QUANT16_MAX); - } - } - } else if (std::is_same::value) { // int16 -> float - for (int i = 0; i < this->get_size(); i++) { - element[i] = dequantize(in_element[i], DL_SCALE(input.exponent)); - } - } else { - return false; - } - } - return false; - } else { - return false; - } - - return true; -} -template bool Tensor::convert_from(const Tensor &input); -template bool Tensor::convert_from(const Tensor &input); -template bool Tensor::convert_from(const Tensor &input); - -template -bool Tensor::convert_from(const Tensor &input) -{ - float *in_element = input.element; - T *element = this->element; - if (in_element && element) { - if (this->is_same_shape(input)) { - if (std::is_same::value) { // float -> int8 - // TODO:: more effective implementation - float scale = 1.0 / (DL_SCALE(this->exponent)); - for (int i = 0; i < this->get_size(); i++) { - element[i] = quantize(in_element[i], scale, DL_QUANT8_MIN, DL_QUANT8_MAX); - } - } else if (std::is_same::value) { // float -> int16 - float scale = 1.0 / (DL_SCALE(this->exponent)); - for (int i = 0; i < this->get_size(); i++) { - element[i] = quantize(in_element[i], scale, DL_QUANT16_MIN, DL_QUANT16_MAX); - } - } else if (std::is_same::value) { // float -> float - tool::copy_memory(element, in_element, this->get_size() * sizeof(T)); - } else { - return false; - } - } - return false; - } else { - return false; - } - - return true; -} -template bool Tensor::convert_from(const Tensor &input); -template bool Tensor::convert_from(const Tensor &input); -template bool Tensor::convert_from(const Tensor &input); - -// template -// Tensor &Tensor::set_padding_value(const vector padding, T value) -// { -// assert(this->shape.size()); // call set_shape() first -// assert(this->element != NULL); -// assert(this->shape.size() == 3); // TODO: || this->shape.size() == 2 - -// bool no_padding = true; -// for (size_t i = 0; i < padding.size(); i++) -// { -// assert(padding[i] <= this->padding[i]); - -// if (padding[i] > 0) -// { -// no_padding = false; -// break; -// } -// } -// if (no_padding) -// return *this; // return directly if no padding at all - -// if (this->shape.size() == 3) -// { -// // top -// int w_padding = (this->shape[1] + padding[2] + padding[3]) * this->shape[2]; -// T *start_ptr = this->get_element_ptr(padding); -// int w_offset_0 = this->shape[1] * this->shape[2]; - -// for (int i = 0; i < padding[0]; ++i) -// { -// dl::tool::set_value(start_ptr, value, w_padding); -// start_ptr += w_offset_0; -// } - -// // left & right -// int w_offset_1 = (padding[2] + this->shape[1]) * this->shape[2]; -// int w_offset_2 = w_offset_0 - w_offset_1; -// int w_len_1 = padding[2] * this->shape[2]; -// int w_len_2 = padding[3] * this->shape[2]; - -// for (int i = 0; i < this->shape[0]; ++i) -// { -// dl::tool::set_value(start_ptr, value, w_len_1); -// start_ptr += w_offset_1; -// dl::tool::set_value(start_ptr, value, w_len_2); -// start_ptr += w_offset_2; -// } - -// // bottom -// for (int i = 0; i < padding[1]; ++i) -// { -// dl::tool::set_value(start_ptr, value, w_padding); -// start_ptr += w_offset_0; -// } -// } - -// else if (this->shape.size() == 2) -// { -// printf("Tensor.set_padding_value with this->shape.size() == 2 not implement yet.\n"); -// } - -// return *this; -// } -// template Tensor &Tensor::set_padding_value(const vector padding, uint16_t value); -// template Tensor &Tensor::set_padding_value(const vector padding, uint8_t value); -// template Tensor &Tensor::set_padding_value(const vector padding, int32_t value); -// template Tensor &Tensor::set_padding_value(const vector padding, int16_t value); -// template Tensor &Tensor::set_padding_value(const vector padding, int8_t value); -// template Tensor &Tensor::set_padding_value(const vector padding, float value); - -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/fbs_loader/include/fbs_loader.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/fbs_loader/include/fbs_loader.hpp deleted file mode 100644 index 5387db6f..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/fbs_loader/include/fbs_loader.hpp +++ /dev/null @@ -1,93 +0,0 @@ -#pragma once - -#include "esp_idf_version.h" -#include "esp_log.h" -#include "esp_partition.h" -#if ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(5, 0, 0) -#include "spi_flash_mmap.h" -#endif -#include "fbs_model.hpp" - -namespace fbs { - -typedef enum { - MODEL_LOCATION_IN_FLASH_RODATA = 0, // The model in FLASH .rodata section - MODEL_LOCATION_IN_FLASH_PARTITION = 1, // The model in SPIFFS - MODEL_LOCATION_IN_SDCARD = 2, // The model in SDCard - MODEL_LOCATION_MAX = MODEL_LOCATION_IN_SDCARD, -} model_location_type_t; - -/** - * @brief Class for parser the flatbuffers. - * - */ -class FbsLoader { -public: - /** - * @brief Construct a new FbsLoader object. - * - * @param rodata_address_or_partition_label_or_path - * The address of model data while location is MODEL_LOCATION_IN_FLASH_RODATA. - * The label of partition while location is MODEL_LOCATION_IN_FLASH_PARTITION. - * The path of model while location is MODEL_LOCATION_IN_SDCARD. - * @param location The model location. - */ - FbsLoader(const char *rodata_address_or_partition_label_or_path = nullptr, - model_location_type_t location = MODEL_LOCATION_IN_FLASH_RODATA); - - /** - * @brief Destroy the FbsLoader object. - */ - ~FbsLoader(); - - /** - * @brief Load the model. If there are multiple sub-models, the first sub-model will be loaded. - * - * @param key NULL or a 128-bit AES key, like {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, - * 0x0b, 0x0c, 0x0d, 0x0e, 0x0f} - * - * @return Return nullptr if loading fails. Otherwise return the pointer of FbsModel. - */ - FbsModel *load(const uint8_t *key = nullptr); - - /** - * @brief Load the model by model index. - * - * @param model_index The index of model. - * @param key NULL or a 128-bit AES key, like {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, - * 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}. - * - * @return Return nullptr if loading fails. Otherwise return the pointer of FbsModel. - */ - FbsModel *load(const int model_index, const uint8_t *key = nullptr); - - /** - * @brief Load the model by model name. - * - * @param model_name The name of model. - * @param key NULL or a 128-bit AES key, like {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, - * 0x0b, 0x0c, 0x0d, 0x0e, 0x0f} - * - * @return Return nullptr if loading fails. Otherwise return the pointer of FbsModel. - */ - FbsModel *load(const char *model_name, const uint8_t *key = nullptr); - - /** - * @brief Get the number of models. - * - * @return The number of models - */ - int get_model_num(); - - /** - * @brief List all model's name - */ - void list_models(); - -private: - void *m_mmap_handle; - model_location_type_t m_location; - const void *m_fbs_buf; -}; - -} // namespace fbs diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/fbs_loader/include/fbs_model.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/fbs_loader/include/fbs_model.hpp deleted file mode 100644 index 054d6710..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/fbs_loader/include/fbs_model.hpp +++ /dev/null @@ -1,267 +0,0 @@ -#pragma once - -#include "dl_tensor_base.hpp" -#include "esp_log.h" -#include -#include -#include -#include -#include - -namespace fbs { - -/** - * @brief Flatbuffer model object. - */ -class FbsModel { -public: - /** - * @brief Construct a new FbsModel object. - * - * @param name The label of partition while location is MODEL_LOCATION_IN_FLASH. - * The path of model while location is MODEL_LOCATION_IN_SDCARD. - * @param location The model location. - */ - FbsModel(const void *data, bool auto_free = false); - - /** - * @brief Destroy the FbsModel object. - */ - ~FbsModel(); - - /** - * @brief Print the model information. - */ - void print(); - - /** - * @brief Return vector of node name in the order of execution. - * - * @return topological sort of node name. - */ - std::vector topological_sort(); - - /** - * @brief Get the attribute of node. - * - * @param node_name The name of operation. - * @param attribute_name The name of attribute. - * @param ret_value The attribute value. - * - * @return esp_err_t Return ESP_OK if get successfully. Otherwise return ESP_FAIL. - */ - esp_err_t get_operation_attribute(std::string node_name, std::string attribute_name, int &ret_value); - esp_err_t get_operation_attribute(std::string node_name, std::string attribute_name, float &ret_value); - esp_err_t get_operation_attribute(std::string node_name, std::string attribute_name, std::string &ret_value); - esp_err_t get_operation_attribute(std::string node_name, std::string attribute_name, std::vector &ret_value); - esp_err_t get_operation_attribute(std::string node_name, std::string attribute_name, std::vector &ret_value); - esp_err_t get_operation_attribute(std::string node_name, std::string attribute_name, dl::quant_type_t &ret_value); - esp_err_t get_operation_attribute(std::string node_name, - std::string attribute_name, - dl::activation_type_t &ret_value); - esp_err_t get_operation_attribute(std::string node_name, std::string attribute_name, dl::resize_mode_t &ret_value); - - /** - * @brief Get operation output shape - * - * @param node_name The name of operation. - * @param index The index of outputs - * @param ret_value Return shape value. - * - * @return esp_err_t Return ESP_OK if get successfully. Otherwise return ESP_FAIL. - */ - esp_err_t get_operation_output_shape(std::string node_name, int index, std::vector &ret_value); - - /** - * @brief Get the attribute of node. - * - * @param node_name The name of operation. - * @param inputs The vector of operation inputs. - * @param outputs The vector of operation outputs. - * - * @return esp_err_t Return ESP_OK if get successfully. Otherwise return ESP_FAIL. - */ - esp_err_t get_operation_inputs_and_outputs(std::string node_name, - std::vector &inputs, - std::vector &outputs); - - /** - * @brief Get operation type, "Conv", "Linear" etc - * - * @param node_name The name of operation - * - * @return The type of operation. - */ - std::string get_operation_type(std::string node_name); - - /** - * @brief Return if the variable is a parameter - * - * @param node_name The name of operation - * @param index The index of the variable - * @param copy If true, return a copy of the variable - * @param caps Bitwise OR of MALLOC_CAP_* flags indicating the type of memory to be returned - * - * @return TensorBase - */ - dl::TensorBase *get_operation_parameter(std::string node_name, - int index = 1, - bool copy = true, - uint32_t caps = MALLOC_CAP_SPIRAM); - - /** - * @brief Get LUT(Look Up Table) if the operation has LUT - * - * @param node_name The name of operation - * @param copy If true, return a copy of the variable - * @param caps Bitwise OR of MALLOC_CAP_* flags indicating the type of memory to be returned - * @param attribute_name The name of LUT attribute - */ - dl::TensorBase *get_operation_lut(std::string node_name, - bool copy = true, - uint32_t caps = MALLOC_CAP_SPIRAM, - std::string attribute_name = "lut"); - - /** - * @brief return true if the variable is a parameter - * - * @param name Variable name - * - * @return true if the variable is a parameter else false - */ - bool is_parameter(std::string name); - - /** - * @brief Get the raw data of FlatBuffers::Dl::Tensor. - * - * @param tensor_name The name of Tensor. - * - * @return uint8_t * The pointer of raw data. - */ - const void *get_tensor_raw_data(std::string tensor_name); - - /** - * @brief Get the element type of tensor tensor. - * - * @param tensor_name The tensor name. - * - * @return FlatBuffers::Dl::TensorDataType - */ - dl::dtype_t get_tensor_dtype(std::string tensor_name); - - /** - * @brief Get the shape of tensor. - * - * @param tensor_name The name of tensor. - * - * @return std::vector The shape of tensor. - */ - std::vector get_tensor_shape(std::string tensor_name); - - /** - * @brief Get the exponents of tensor. - * - * @warning When quantization is PER_CHANNEL, the size of exponents is same as out_channels. - * When quantization is PER_TENSOR, the size of exponents is 1. - * - * @param tensor_name The name of tensor. - * - * @return The exponents of tensor. - */ - std::vector get_tensor_exponents(std::string tensor_name); - - /** - * @brief Get the element type of value_info. - * - * @param var_name The value_info name. - * - * @return dl::dtype_t - */ - dl::dtype_t get_value_info_dtype(std::string var_name); - - /** - * @brief Get the shape of value_info. - * - * @param var_name The value_info name. - * - * @return the shape of value_info. - */ - std::vector get_value_info_shape(std::string var_name); - - /** - * @brief Get the exponent of value_info. Only support PER_TENSOR quantization. - * - * @param var_name The value_info name. - * - * @return the exponent of value_info - */ - int get_value_info_exponent(std::string var_name); - - /** - * @brief Get the raw data of test input tensor. - * - * @param tensor_name The name of test input tensor. - * - * @return uint8_t * The pointer of raw data. - */ - const void *get_test_input_tensor_raw_data(std::string tensor_name); - - /** - * @brief Get the raw data of test output tensor. - * - * @param tensor_name The name of test output tensor. - * - * @return uint8_t * The pointer of raw data. - */ - const void *get_test_output_tensor_raw_data(std::string tensor_name); - - /** - * @brief Get the graph inputs. - * - * @return the name of inputs - */ - std::vector get_graph_inputs(); - - /** - * @brief Get the graph outputs. - * - * @return the name of ounputs - */ - std::vector get_graph_outputs(); - - /** - * @brief Clear all map - */ - void clear_map(); - - /** - * @brief Load all map - */ - void load_map(); - - /** - * @brief Get model name - */ - std::string get_model_name(); - - /** - * @brief Get model version - */ - int64_t get_model_version(); - - /** - * @brief Get model doc string - */ - std::string get_model_doc_string(); - -private: - bool m_auto_free; - const uint8_t *m_data; - const void *m_model; - std::map m_name_to_node_map; - std::map m_name_to_initial_tensor_map; - std::map m_name_to_value_info_map; - std::unordered_map m_name_to_test_inputs_value_map; - std::unordered_map m_name_to_test_outputs_value_map; -}; -} // namespace fbs diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/fbs_loader/lib/esp32p4/libfbs_model.a b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/fbs_loader/lib/esp32p4/libfbs_model.a deleted file mode 100644 index 62a301bb..00000000 Binary files a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/fbs_loader/lib/esp32p4/libfbs_model.a and /dev/null differ diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/fbs_loader/lib/esp32s3/libfbs_model.a b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/fbs_loader/lib/esp32s3/libfbs_model.a deleted file mode 100644 index f2f82cb9..00000000 Binary files a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/fbs_loader/lib/esp32s3/libfbs_model.a and /dev/null differ diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/fbs_loader/src/fbs_loader.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/fbs_loader/src/fbs_loader.cpp deleted file mode 100644 index c5da3009..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/fbs_loader/src/fbs_loader.cpp +++ /dev/null @@ -1,254 +0,0 @@ -#include "fbs_loader.hpp" -#include "mbedtls/aes.h" - -static const char *TAG = "FbsLoader"; - -namespace fbs { - -/** - * @brief This function is used to decrypt the AES 128-bit CTR mode encrypted data. - * AES (Advanced Encryption Standard) is a widely-used symmetric encryption algorithm that provides strong security for - * data protection CTR mode converts the block cipher into a stream cipher, allowing it to encrypt data of any length - * without the need for padding - * - * @param ciphertext Input Fbs data encrypted by AES 128-bit CTR mode - * @param plaintext Decrypted data - * @param size Size of input data - * @param key 128-bit AES key - */ -void fbs_aes_crypt_ctr(const uint8_t *ciphertext, uint8_t *plaintext, size_t size, const uint8_t *key) -{ - mbedtls_aes_context aes_ctx; - size_t offset = 0; - uint8_t nonce[16] = { - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F}; - uint8_t stream_block[16]; - mbedtls_aes_init(&aes_ctx); - mbedtls_aes_setkey_enc(&aes_ctx, key, 128); // 128-bit key - mbedtls_aes_crypt_ctr(&aes_ctx, size, &offset, nonce, stream_block, ciphertext, plaintext); - mbedtls_aes_free(&aes_ctx); -} - -/** - FBS_FILE_FORMAT_EDL1: - { - char[4]: "EDL1", - uint32: the mode of entru - uint32: the length of data - uint8[]: the data - } - - FBS_FILE_FORMAT_PDL1: - { - "PDL1": char[4] - model_num: uint32 - model1_data_offset: uint32 - model1_name_offset: uint32 - model1_name_length: uint32 - model2_data_offset: uint32 - model2_name_offset: uint32 - model2_name_length: uint32 - ... - model1_name, - model2_name, - ... - model1_data(format:FBS_FILE_FORMAT_EDL1), - model2_data(format:FBS_FILE_FORMAT_EDL1), - ... - } -*/ -typedef enum { FBS_FILE_FORMAT_UNK = 0, FBS_FILE_FORMAT_EDL1 = 1, FBS_FILE_FORMAT_PDL1 = 2 } fbs_file_format_t; - -fbs_file_format_t get_model_format(const char *format) -{ - char str[5]; - memcpy(str, format, 4); - str[4] = '\0'; - - if (strcmp(str, "EDL1") == 0) { - return FBS_FILE_FORMAT_EDL1; - } else if (strcmp(str, "PDL1") == 0) { - return FBS_FILE_FORMAT_PDL1; - } else { - return FBS_FILE_FORMAT_UNK; - } - - return FBS_FILE_FORMAT_UNK; -} - -esp_err_t get_model_offset_by_index(const uint8_t *fbs_buf, uint32_t index, uint32_t &offset) -{ - const uint32_t *header = (const uint32_t *)fbs_buf; - uint32_t model_num = header[1]; - if (index >= model_num) { - ESP_LOGE(TAG, "The model index is out of range."); - return ESP_FAIL; - } - - offset = header[2 + index * 3]; - return ESP_OK; -} - -FbsModel *create_fbs_model(const uint8_t *model_buf, const uint8_t *key) -{ - if (model_buf == nullptr) { - ESP_LOGE(TAG, "Model's flatbuffers is empty."); - return nullptr; - } - - uint32_t *header = (uint32_t *)model_buf; - uint32_t mode = header[1]; // cryptographic mode, 0: without encryption, 1: aes encryption - uint32_t size = header[2]; - if (mode != 0 && key == NULL) { - ESP_LOGE(TAG, "This is a cryptographic model, please enter the secret key!"); - return nullptr; - } - - model_buf += 12; - if (mode == 0) { // without encryption - return new FbsModel(model_buf, false); - } else if (mode == 1) { // 128-bit AES encryption - uint8_t *m_data = (uint8_t *)heap_caps_malloc(size, MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM); - fbs_aes_crypt_ctr(model_buf, m_data, size, key); - return new FbsModel(m_data, true); - } - - return nullptr; -} - -FbsLoader::FbsLoader(const char *name, model_location_type_t location) : - m_mmap_handle(nullptr), m_location(location), m_fbs_buf(nullptr) -{ - if (name == nullptr) { - return; - } - - if (m_location == MODEL_LOCATION_IN_FLASH_RODATA) { - m_fbs_buf = (const void *)name; - } else if (m_location == MODEL_LOCATION_IN_FLASH_PARTITION) { - const esp_partition_t *partition = - esp_partition_find_first(ESP_PARTITION_TYPE_DATA, ESP_PARTITION_SUBTYPE_ANY, name); - if (partition) { - int free_pages = spi_flash_mmap_get_free_pages(SPI_FLASH_MMAP_DATA); - uint32_t storage_size = free_pages * 64 * 1024; // Byte - ESP_LOGI(TAG, "The storage free size is %ld KB", storage_size / 1024); - ESP_LOGI(TAG, "The partition size is %ld KB", partition->size / 1024); - if (storage_size < partition->size) { - ESP_LOGE(TAG, - "The storage free size of this board is less than %s partition required size", - partition->label); - } - this->m_mmap_handle = (esp_partition_mmap_handle_t *)malloc(sizeof(esp_partition_mmap_handle_t)); - ESP_ERROR_CHECK(esp_partition_mmap(partition, - 0, - partition->size, - ESP_PARTITION_MMAP_DATA, - &this->m_fbs_buf, - static_cast(this->m_mmap_handle))); - } else { - ESP_LOGE(TAG, "Can not find %s in partition table", name); - } - } else if (m_location == MODEL_LOCATION_IN_SDCARD) { - // TODO - } -} - -FbsLoader::~FbsLoader() -{ - if (m_location == MODEL_LOCATION_IN_FLASH_PARTITION) { - esp_partition_munmap(*static_cast(this->m_mmap_handle)); // support esp-idf v5 - if (this->m_mmap_handle) { - free(this->m_mmap_handle); - this->m_mmap_handle = nullptr; - } - } else if (m_location == MODEL_LOCATION_IN_SDCARD) { - // TODO - } -} - -FbsModel *FbsLoader::load(const int model_index, const uint8_t *key) -{ - if (this->m_fbs_buf == nullptr) { - ESP_LOGE(TAG, "Model's flatbuffers is empty."); - return nullptr; - } - - uint8_t *model_buf = (uint8_t *)m_fbs_buf; - uint32_t offset = 0; - fbs_file_format_t format = get_model_format((const char *)m_fbs_buf); - if (format == FBS_FILE_FORMAT_PDL1) { - // packed multiple espdl models - if (get_model_offset_by_index(model_buf, model_index, offset) != ESP_OK) { - return nullptr; - } - } else if (format == FBS_FILE_FORMAT_EDL1) { - // single espdl model - if (model_index > 0) { - ESP_LOGW(TAG, "There are only one model in the flatbuffers, ignore the input model index!"); - } - offset = 0; - } else { - ESP_LOGE(TAG, "Unsupported format, or the model file is corrupted!"); - return nullptr; - } - return create_fbs_model(model_buf + offset, key); -} - -FbsModel *FbsLoader::load(const uint8_t *key) -{ - return this->load(0, key); -} - -FbsModel *FbsLoader::load(const char *model_name, const uint8_t *key) -{ - return this->load(0, key); -} - -int FbsLoader::get_model_num() -{ - if (this->m_fbs_buf == nullptr) { - return 0; - } - - uint8_t *model_buf = (uint8_t *)m_fbs_buf; - fbs_file_format_t format = get_model_format((const char *)m_fbs_buf); - if (format == FBS_FILE_FORMAT_PDL1) { - // packed multiple espdl models - uint32_t *header = (uint32_t *)model_buf; - uint32_t model_num = header[1]; - return model_num; - } else if (format == FBS_FILE_FORMAT_EDL1) { - // single espdl model - return 1; - } else { - ESP_LOGE(TAG, "Unsupported format, or the model file is corrupted!"); - return 0; - } - - return 0; -} - -void FbsLoader::list_models() -{ - if (this->m_fbs_buf == nullptr) { - ESP_LOGE(TAG, "Model's flatbuffers is empty."); - return; - } - - fbs_file_format_t format = get_model_format((const char *)m_fbs_buf); - if (format == FBS_FILE_FORMAT_PDL1) { - // packed multiple espdl models - uint32_t *header = (uint32_t *)m_fbs_buf; - uint32_t model_num = header[1]; - for (int i = 0; i < model_num; i++) { - uint32_t name_offset = header[2 + 3 * i + 1]; - uint32_t name_length = header[2 + 3 * i + 2]; - std::string name((const char *)m_fbs_buf + name_offset, name_length); - ESP_LOGI(TAG, "model name: %s, index:%d", name.c_str(), i); - } - } else if (format == FBS_FILE_FORMAT_EDL1) { - ESP_LOGI(TAG, "There are only one model in the flatbuffers without model name."); - } -} - -} // namespace fbs diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/idf_component.yml b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/idf_component.yml deleted file mode 100644 index 80ae631e..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/idf_component.yml +++ /dev/null @@ -1,5 +0,0 @@ -version: "3.0.0" -description: esp-dl is a lightweight and efficient neural network inference framework designed specifically for ESP series chips. -url: https://github.com/espressif/esp-dl/tree/master/esp-dl -dependencies: - idf: ">=5.3" diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/detect/dl_detect_define.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/detect/dl_detect_define.hpp deleted file mode 100644 index 12004ab5..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/detect/dl_detect_define.hpp +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once - -#include "dl_define.hpp" -#include "dl_tool.hpp" -#include - -namespace dl { -namespace detect { -typedef struct { - int category; /* box; /* keypoint; /* b.score; -} - -typedef struct { - int stride_y; - int stride_x; - int offset_y; - int offset_x; -} anchor_point_stage_t; - -typedef struct { - int stride_y; - int stride_x; - int offset_y; - int offset_x; - std::vector> anchor_shape; -} anchor_box_stage_t; -} // namespace detect -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/detect/dl_detect_postprocessor.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/detect/dl_detect_postprocessor.cpp deleted file mode 100644 index c706daa7..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/detect/dl_detect_postprocessor.cpp +++ /dev/null @@ -1,66 +0,0 @@ -#include "dl_detect_postprocessor.hpp" - -namespace dl { -namespace detect { -void DetectPostprocessor::nms() -{ - dl::tool::Latency latency; - latency.start(); - int kept_number = 0; - for (std::list::iterator kept = this->box_list.begin(); kept != this->box_list.end(); kept++) { - kept_number++; - - if (kept_number >= this->top_k) { - this->box_list.erase(++kept, this->box_list.end()); - break; - } - - int kept_area = (kept->box[2] - kept->box[0] + 1) * (kept->box[3] - kept->box[1] + 1); - - std::list::iterator other = kept; - other++; - for (; other != this->box_list.end();) { - int inter_lt_x = DL_MAX(kept->box[0], other->box[0]); - int inter_lt_y = DL_MAX(kept->box[1], other->box[1]); - int inter_rb_x = DL_MIN(kept->box[2], other->box[2]); - int inter_rb_y = DL_MIN(kept->box[3], other->box[3]); - - int inter_height = inter_rb_y - inter_lt_y + 1; - int inter_width = inter_rb_x - inter_lt_x + 1; - - if (inter_height > 0 && inter_width > 0) { - int other_area = (other->box[2] - other->box[0] + 1) * (other->box[3] - other->box[1] + 1); - int inter_area = inter_height * inter_width; - float iou = (float)inter_area / (kept_area + other_area - inter_area); - if (iou > this->nms_threshold) { - other = this->box_list.erase(other); - continue; - } - } - other++; - } - } - latency.end(); - latency.print("detect", "postprocess::nms"); -} - -std::list &DetectPostprocessor::get_result(const std::vector &input_shape) -{ - for (result_t &res : this->box_list) { - for (int i = 0; i < res.box.size(); i++) { - if (i % 2 == 0) - res.box[i] = DL_CLIP(res.box[i], 0, input_shape[1] - 1); - else - res.box[i] = DL_CLIP(res.box[i], 0, input_shape[0] - 1); - } - for (int i = 0; i < res.keypoint.size(); i++) { - if (i % 2 == 0) - res.keypoint[i] = DL_CLIP(res.keypoint[i], 0, input_shape[1] - 1); - else - res.keypoint[i] = DL_CLIP(res.keypoint[i], 0, input_shape[0] - 1); - } - } - return this->box_list; -} -} // namespace detect -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/detect/dl_detect_postprocessor.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/detect/dl_detect_postprocessor.hpp deleted file mode 100644 index ff672e37..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/detect/dl_detect_postprocessor.hpp +++ /dev/null @@ -1,58 +0,0 @@ -#pragma once -#include "dl_detect_define.hpp" -#include -#include -#include - -namespace dl { -namespace detect { -class DetectPostprocessor { -protected: - const float score_threshold; /* box_list; /* &model_outputs_map) = 0; - void nms(); - void set_resize_scale_x(float resize_scale_x) { this->resize_scale_x = resize_scale_x; }; - void set_resize_scale_y(float resize_scale_y) { this->resize_scale_y = resize_scale_y; }; - void set_top_left_x(float top_left_x) { this->top_left_x = top_left_x; }; - void set_top_left_y(float top_left_y) { this->top_left_y = top_left_y; }; - void clear_result() { this->box_list.clear(); }; - std::list &get_result(const std::vector &input_shape); -}; - -class AnchorPointDetectPostprocessor : public DetectPostprocessor { -protected: - std::vector stages; - -public: - AnchorPointDetectPostprocessor(const float score_threshold, - const float nms_threshold, - const int top_k, - const std::vector &stages) : - DetectPostprocessor(score_threshold, nms_threshold, top_k), stages(stages) {}; -}; - -class AnchorBoxDetectPostprocessor : public DetectPostprocessor { -protected: - std::vector stages; - -public: - AnchorBoxDetectPostprocessor(const float score_threshold, - const float nms_threshold, - const int top_k, - const std::vector &stages) : - DetectPostprocessor(score_threshold, nms_threshold, top_k), stages(stages) {}; -}; -} // namespace detect -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/image/dl_image.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/image/dl_image.cpp deleted file mode 100644 index e8a5377b..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/image/dl_image.cpp +++ /dev/null @@ -1,1696 +0,0 @@ -#include "dl_image.hpp" -#include "esp_log.h" -#include -#include - -namespace dl { -namespace image { -template -void crop_and_resize(T *dst_image, - int dst_width, - int dst_channel, - int dst_y_start, - int dst_y_end, - int dst_x_start, - int dst_x_end, - uint16_t *src_image, - int src_height, - int src_width, - int src_channel, - int src_y_start, - int src_y_end, - int src_x_start, - int src_x_end, - resize_type_t resize_type, - bool byte_swap, - bool rgb_swap) -{ - assert(src_channel == 3); - assert(dst_y_start >= 0); - assert(dst_x_start >= 0); - - float scale_y = (float)(src_y_end - src_y_start) / (dst_y_end - dst_y_start); - float scale_x = (float)(src_x_end - src_x_start) / (dst_x_end - dst_x_start); - int temp[13]; - - switch (resize_type) { - case IMAGE_RESIZE_BILINEAR: - for (size_t y = dst_y_start; y < dst_y_end; y++) { - float ratio_y[2]; - ratio_y[0] = (float)(((y - dst_y_start) + 0.5) * scale_y - 0.5); // y - int src_y = (int)ratio_y[0]; // y1 - ratio_y[0] -= src_y; // y - y1 - src_y += src_y_start; - - if (src_y < 0) { - ratio_y[0] = 0; - src_y = 0; - } else if (src_y > src_height - 2) { - ratio_y[0] = 0; - src_y = src_height - 2; - } - ratio_y[1] = 1 - ratio_y[0]; // y2 - y - - int _dst_i = y * dst_width; - - int _src_row_0 = src_y * src_width; - int _src_row_1 = _src_row_0 + src_width; - - for (size_t x = dst_x_start; x < dst_x_end; x++) { - float ratio_x[2]; - ratio_x[0] = (float)(((x - dst_x_start) + 0.5) * scale_x - 0.5); // x - int src_x = (int)ratio_x[0]; // x1 - ratio_x[0] -= src_x; // x - x1 - src_x += src_x_start; - - if (src_x < 0) { - ratio_x[0] = 0; - src_x = 0; - } else if (src_x > src_width - 2) { - ratio_x[0] = 0; - src_x = src_width - 2; - } - ratio_x[1] = 1 - ratio_x[0]; // x2 - x - - int dst_i = (_dst_i + x) * dst_channel; - - int src_row_0 = _src_row_0 + src_x; - int src_row_1 = _src_row_1 + src_x; - - convert_pixel_rgb565_to_rgb888(src_image[src_row_0], temp, byte_swap); - convert_pixel_rgb565_to_rgb888(src_image[src_row_0 + 1], temp + 3, byte_swap); - convert_pixel_rgb565_to_rgb888(src_image[src_row_1], temp + 6, byte_swap); - convert_pixel_rgb565_to_rgb888(src_image[src_row_1 + 1], temp + 9, byte_swap); - - if (dst_channel == 3) { - if (rgb_swap) { - dst_image[dst_i] = (int)(temp[2] * ratio_x[1] * ratio_y[1] // - + temp[5] * ratio_x[0] * ratio_y[1] // - + temp[8] * ratio_x[1] * ratio_y[0] // - + temp[11] * ratio_x[0] * ratio_y[0] // - + 0.5); // - - dst_image[dst_i + 1] = (int)(temp[1] * ratio_x[1] * ratio_y[1] // - + temp[4] * ratio_x[0] * ratio_y[1] // - + temp[7] * ratio_x[1] * ratio_y[0] // - + temp[10] * ratio_x[0] * ratio_y[0] // - + 0.5); // - - dst_image[dst_i + 2] = (int)(temp[0] * ratio_x[1] * ratio_y[1] // - + temp[3] * ratio_x[0] * ratio_y[1] // - + temp[6] * ratio_x[1] * ratio_y[0] // - + temp[9] * ratio_x[0] * ratio_y[0] // - + 0.5); // - } else { - dst_image[dst_i] = (int)(temp[0] * ratio_x[1] * ratio_y[1] // - + temp[3] * ratio_x[0] * ratio_y[1] // - + temp[6] * ratio_x[1] * ratio_y[0] // - + temp[9] * ratio_x[0] * ratio_y[0] // - + 0.5); // - - dst_image[dst_i + 1] = (int)(temp[1] * ratio_x[1] * ratio_y[1] // - + temp[4] * ratio_x[0] * ratio_y[1] // - + temp[7] * ratio_x[1] * ratio_y[0] // - + temp[10] * ratio_x[0] * ratio_y[0] // - + 0.5); // - - dst_image[dst_i + 2] = (int)(temp[2] * ratio_x[1] * ratio_y[1] // - + temp[5] * ratio_x[0] * ratio_y[1] // - + temp[8] * ratio_x[1] * ratio_y[0] // - + temp[11] * ratio_x[0] * ratio_y[0] // - + 0.5); // - } - } else if (dst_channel == 1) // RGB -> Gray - { - int blue, green, red; - if (rgb_swap) { - blue = (int)(temp[2] * ratio_x[1] * ratio_y[1] // - + temp[5] * ratio_x[0] * ratio_y[1] // - + temp[8] * ratio_x[1] * ratio_y[0] // - + temp[11] * ratio_x[0] * ratio_y[0] // - + 0.5); // - green = (int)(temp[1] * ratio_x[1] * ratio_y[1] // - + temp[4] * ratio_x[0] * ratio_y[1] // - + temp[7] * ratio_x[1] * ratio_y[0] // - + temp[10] * ratio_x[0] * ratio_y[0] // - + 0.5); // - red = (int)(temp[0] * ratio_x[1] * ratio_y[1] // - + temp[3] * ratio_x[0] * ratio_y[1] // - + temp[6] * ratio_x[1] * ratio_y[0] // - + temp[9] * ratio_x[0] * ratio_y[0] // - + 0.5); // - } else { - blue = (int)(temp[0] * ratio_x[1] * ratio_y[1] // - + temp[3] * ratio_x[0] * ratio_y[1] // - + temp[6] * ratio_x[1] * ratio_y[0] // - + temp[9] * ratio_x[0] * ratio_y[0] // - + 0.5); // - green = (int)(temp[1] * ratio_x[1] * ratio_y[1] // - + temp[4] * ratio_x[0] * ratio_y[1] // - + temp[7] * ratio_x[1] * ratio_y[0] // - + temp[10] * ratio_x[0] * ratio_y[0] // - + 0.5); // - red = (int)(temp[2] * ratio_x[1] * ratio_y[1] // - + temp[5] * ratio_x[0] * ratio_y[1] // - + temp[8] * ratio_x[1] * ratio_y[0] // - + temp[11] * ratio_x[0] * ratio_y[0] // - + 0.5); // - } - dst_image[dst_i] = convert_pixel_rgb888_to_gray(red, green, blue); - } else { - printf("Not implement dst_channel = %d\n", dst_channel); - } - } - } - break; - - case IMAGE_RESIZE_MEAN: - for (int y = dst_y_start; y < dst_y_end; y++) { - int _dst_i = y * dst_width; - - float src_y = rintf((y - dst_y_start) * scale_y + src_y_start); - src_y = DL_CLIP(src_y, 0, src_height - 2); - float _src_row_0 = src_y * src_width; - float _src_row_1 = _src_row_0 + src_width; - - for (int x = dst_x_start; x < dst_x_end; x++) { - int dst_i = (_dst_i + x) * dst_channel; - - float src_x = rintf((x - dst_x_start) * scale_x) + src_x_start; - src_x = DL_CLIP(src_x, 0, src_width - 2); - int src_row_0 = _src_row_0 + src_x; - int src_row_1 = _src_row_1 + src_x; - - convert_pixel_rgb565_to_rgb888(src_image[src_row_0], temp, byte_swap); - convert_pixel_rgb565_to_rgb888(src_image[src_row_0 + 1], temp + 3, byte_swap); - convert_pixel_rgb565_to_rgb888(src_image[src_row_1], temp + 6, byte_swap); - convert_pixel_rgb565_to_rgb888(src_image[src_row_1 + 1], temp + 9, byte_swap); - - if (dst_channel == 3) { - if (rgb_swap) { - dst_image[dst_i] = (temp[2] + temp[5] + temp[8] + temp[11]) >> 2; // blue - dst_image[dst_i + 1] = (temp[1] + temp[4] + temp[7] + temp[10]) >> 2; // green - dst_image[dst_i + 2] = (temp[0] + temp[3] + temp[6] + temp[9]) >> 2; // red - } else { - dst_image[dst_i] = (temp[0] + temp[3] + temp[6] + temp[9]) >> 2; // blue - dst_image[dst_i + 1] = (temp[1] + temp[4] + temp[7] + temp[10]) >> 2; // green - dst_image[dst_i + 2] = (temp[2] + temp[5] + temp[8] + temp[11]) >> 2; // red - } - } else if (dst_channel == 1) { - int blue, green, red; - if (rgb_swap) { - blue = (temp[2] + temp[5] + temp[8] + temp[11]) >> 2; - green = (temp[1] + temp[4] + temp[7] + temp[10]) >> 2; - red = (temp[0] + temp[3] + temp[6] + temp[9]) >> 2; - } else { - blue = (temp[0] + temp[3] + temp[6] + temp[9]) >> 2; - green = (temp[1] + temp[4] + temp[7] + temp[10]) >> 2; - red = (temp[2] + temp[5] + temp[8] + temp[11]) >> 2; - } - dst_image[dst_i] = convert_pixel_rgb888_to_gray(red, green, blue); - } else { - printf("Not implement dst_channel = %d\n", dst_channel); - } - } - } - break; - - case IMAGE_RESIZE_NEAREST: - for (size_t y = dst_y_start; y < dst_y_end; y++) { - int _dst_i = y * dst_width; - - float src_y = rintf((y - dst_y_start) * scale_y + src_y_start); - src_y = DL_CLIP(src_y, 0, src_height - 1); - float _src_i = src_y * src_width; - - for (size_t x = dst_x_start; x < dst_x_end; x++) { - int dst_i = (_dst_i + x) * dst_channel; - - float src_x = rintf((x - dst_x_start) * scale_x) + src_x_start; - src_x = DL_CLIP(src_x, 0, src_width - 1); - int src_i = _src_i + src_x; - - convert_pixel_rgb565_to_rgb888(src_image[src_i], temp, byte_swap); - - if (dst_channel == 3) { - if (rgb_swap) { - dst_image[dst_i] = temp[2]; - dst_image[dst_i + 1] = temp[1]; - dst_image[dst_i + 2] = temp[0]; - } else { - dst_image[dst_i] = temp[0]; - dst_image[dst_i + 1] = temp[1]; - dst_image[dst_i + 2] = temp[2]; - } - } else if (dst_channel == 1) // RGB -> Gray - { - int blue, green, red; - if (rgb_swap) { - blue = temp[2]; - green = temp[1]; - red = temp[0]; - } else { - blue = temp[0]; - green = temp[1]; - red = temp[2]; - } - dst_image[dst_i] = convert_pixel_rgb888_to_gray(red, green, blue); - } else { - printf("Not implement dst_channel = %d\n", dst_channel); - } - } - } - break; - - default: - printf("Not implement image_resize_type = %d\n", resize_type); - break; - } -} -template void crop_and_resize(uint8_t *dst_image, - int dst_width, - int dst_channel, - int dst_y_start, - int dst_y_end, - int dst_x_start, - int dst_x_end, - uint16_t *src_image, - int src_height, - int src_width, - int src_channel, - int src_y_start, - int src_y_end, - int src_x_start, - int src_x_end, - resize_type_t resize_type, - bool rgb_swap, - bool byte_swap); -template void crop_and_resize(int16_t *dst_image, - int dst_width, - int dst_channel, - int dst_y_start, - int dst_y_end, - int dst_x_start, - int dst_x_end, - uint16_t *src_image, - int src_height, - int src_width, - int src_channel, - int src_y_start, - int src_y_end, - int src_x_start, - int src_x_end, - resize_type_t resize_type, - bool rgb_swap, - bool byte_swap); -template void crop_and_resize(int8_t *dst_image, - int dst_width, - int dst_channel, - int dst_y_start, - int dst_y_end, - int dst_x_start, - int dst_x_end, - uint16_t *src_image, - int src_height, - int src_width, - int src_channel, - int src_y_start, - int src_y_end, - int src_x_start, - int src_x_end, - resize_type_t resize_type, - bool rgb_swap, - bool byte_swap); - -template -void crop_and_resize(T *dst_image, - int dst_width, - int dst_channel, - int dst_y_start, - int dst_y_end, - int dst_x_start, - int dst_x_end, - uint8_t *src_image, - int src_height, - int src_width, - int src_channel, - int src_y_start, - int src_y_end, - int src_x_start, - int src_x_end, - resize_type_t resize_type, - bool rgb_swap) -{ - assert(dst_y_start >= 0); - assert(dst_x_start >= 0); - - float scale_y = (float)(src_y_end - src_y_start) / (dst_y_end - dst_y_start); - float scale_x = (float)(src_x_end - src_x_start) / (dst_x_end - dst_x_start); - int temp; - - switch (resize_type) { - case IMAGE_RESIZE_BILINEAR: - for (size_t y = dst_y_start; y < dst_y_end; y++) { - float ratio_y[2]; - ratio_y[0] = (float)(((y - dst_y_start) + 0.5) * scale_y - 0.5); // y - int src_y = (int)ratio_y[0]; // y1 - ratio_y[0] -= src_y; // y - y1 - src_y += src_y_start; - - if (src_y < 0) { - ratio_y[0] = 0; - src_y = 0; - } - if (src_y > src_height - 2) { - ratio_y[0] = 0; - src_y = src_height - 2; - } - ratio_y[1] = 1 - ratio_y[0]; // y2 - y - - int _dst_i = y * dst_width; - - int _src_row_0 = src_y * src_width; - int _src_row_1 = _src_row_0 + src_width; - - for (size_t x = dst_x_start; x < dst_x_end; x++) { - float ratio_x[2]; - ratio_x[0] = (float)(((x - dst_x_start) + 0.5) * scale_x - 0.5); // x - int src_x = (int)ratio_x[0]; // x1 - ratio_x[0] -= src_x; // x - x1 - src_x += src_x_start; - - if (src_x < 0) { - ratio_x[0] = 0; - src_x = 0; - } - if (src_x > src_width - 2) { - ratio_x[0] = 0; - src_x = src_width - 2; - } - ratio_x[1] = 1 - ratio_x[0]; // x2 - x - - int dst_i = (_dst_i + x) * dst_channel; - - int src_row_0 = (_src_row_0 + src_x) * src_channel; - int src_row_1 = (_src_row_1 + src_x) * src_channel; - - if (src_channel == dst_channel) { - for (int c = 0; c < dst_channel; c++) { - temp = round(src_image[src_row_0 + c] * ratio_x[1] * ratio_y[1] // - + src_image[src_row_0 + src_channel + c] * ratio_x[0] * ratio_y[1] // - + src_image[src_row_1 + c] * ratio_x[1] * ratio_y[0] // - + src_image[src_row_1 + src_channel + c] * ratio_x[0] * ratio_y[0]); // - if (rgb_swap) - dst_image[dst_i + dst_channel - 1 - c] = temp; - else - dst_image[dst_i + c] = temp; - } - } else if (src_channel == 3 && dst_channel == 1) // RGB -> Gray - { - int blue = round(src_image[src_row_0] * ratio_x[1] * ratio_y[1] // - + src_image[src_row_0 + src_channel] * ratio_x[0] * ratio_y[1] // - + src_image[src_row_1] * ratio_x[1] * ratio_y[0] // - + src_image[src_row_1 + src_channel] * ratio_x[0] * ratio_y[0]); // - - int green = round(src_image[src_row_0 + 1] * ratio_x[1] * ratio_y[1] // - + src_image[src_row_0 + src_channel + 1] * ratio_x[0] * ratio_y[1] // - + src_image[src_row_1 + 1] * ratio_x[1] * ratio_y[0] // - + src_image[src_row_1 + src_channel + 1] * ratio_x[0] * ratio_y[0]); // - - int red = round(src_image[src_row_0 + 2] * ratio_x[1] * ratio_y[1] // - + src_image[src_row_0 + src_channel + 2] * ratio_x[0] * ratio_y[1] // - + src_image[src_row_1 + 2] * ratio_x[1] * ratio_y[0] // - + src_image[src_row_1 + src_channel + 2] * ratio_x[0] * ratio_y[0]); // - if (rgb_swap) - dst_image[dst_i] = convert_pixel_rgb888_to_gray(blue, green, red); - else - dst_image[dst_i] = convert_pixel_rgb888_to_gray(red, green, blue); - } else { - printf("Not implement src_channel = %d and dst_channel = %d\n", src_channel, dst_channel); - } - } - } - break; - - case IMAGE_RESIZE_MEAN: - - for (size_t y = dst_y_start; y < dst_y_end; y++) { - int _dst_i = y * dst_width; - - float src_y = rintf((y - dst_y_start) * scale_y + src_y_start); - src_y = DL_CLIP(src_y, 0, src_height - 2); - float _src_row_0 = src_y * src_width; - float _src_row_1 = _src_row_0 + src_width; - - for (size_t x = dst_x_start; x < dst_x_end; x++) { - int dst_i = (_dst_i + x) * dst_channel; - - float src_x = rintf((x - dst_x_start) * scale_x) + src_x_start; - src_x = DL_CLIP(src_x, 0, src_width - 2); - int src_row_0 = (_src_row_0 + src_x) * src_channel; - int src_row_1 = (_src_row_1 + src_x) * src_channel; - - if (src_channel == dst_channel) { - for (size_t c = 0; c < dst_channel; c++) { - temp = (int)src_image[src_row_0 + c] + (int)src_image[src_row_0 + dst_channel + c] + - (int)src_image[src_row_1 + c] + (int)src_image[src_row_1 + dst_channel + c]; - if (rgb_swap) - dst_image[dst_i + dst_channel - 1 - c] = temp >> 2; - else - dst_image[dst_i + c] = temp >> 2; - } - } else if (src_channel == 3 && dst_channel == 1) // RGB -> Gray - { - int blue = (int)src_image[src_row_0] // - + (int)src_image[src_row_0 + dst_channel] // - + (int)src_image[src_row_1] // - + (int)src_image[src_row_1 + dst_channel]; // - int green = (int)src_image[src_row_0 + 1] // - + (int)src_image[src_row_0 + dst_channel + 1] // - + (int)src_image[src_row_1 + 1] // - + (int)src_image[src_row_1 + dst_channel + 1]; // - int red = (int)src_image[src_row_0 + 2] // - + (int)src_image[src_row_0 + dst_channel + 2] // - + (int)src_image[src_row_1 + 2] // - + (int)src_image[src_row_1 + dst_channel + 2]; // - if (rgb_swap) - dst_image[dst_i] = convert_pixel_rgb888_to_gray(blue, green, red) >> 2; - else - dst_image[dst_i] = convert_pixel_rgb888_to_gray(red, green, blue) >> 2; - } else { - printf("Not implement src_channel = %d and dst_channel = %d\n", src_channel, dst_channel); - } - } - } - break; - - case IMAGE_RESIZE_NEAREST: - for (size_t y = dst_y_start; y < dst_y_end; y++) { - int _dst_i = y * dst_width; - - float src_y = rintf((y - dst_y_start) * scale_y + src_y_start); - src_y = DL_CLIP(src_y, 0, src_height - 1); - float _src_i = src_y * src_width; - - for (size_t x = dst_x_start; x < dst_x_end; x++) { - int dst_i = (_dst_i + x) * dst_channel; - - float src_x = rintf((x - dst_x_start) * scale_x) + src_x_start; - src_x = DL_CLIP(src_x, 0, src_width - 1); - int src_i = (_src_i + src_x) * dst_channel; - - if (src_channel == dst_channel) { - for (size_t c = 0; c < dst_channel; c++) { - if (rgb_swap) - dst_image[dst_i + dst_channel - 1 - c] = (T)src_image[src_i + c]; - else - dst_image[dst_i + c] = (T)src_image[src_i + c]; - } - } else if (src_channel == 3 && dst_channel == 1) // RGB -> Gray - { - if (rgb_swap) - dst_image[dst_i] = - convert_pixel_rgb888_to_gray(src_image[src_i], src_image[src_i + 1], src_image[src_i + 2]); - else - dst_image[dst_i] = - convert_pixel_rgb888_to_gray(src_image[src_i + 2], src_image[src_i + 1], src_image[src_i]); - } else { - printf("Not implement src_channel = %d and dst_channel = %d\n", src_channel, dst_channel); - } - } - } - break; - - default: - printf("Not implement image_resize_type = %d\n", resize_type); - break; - } -} -template void crop_and_resize(uint8_t *dst_image, - int dst_width, - int dst_channel, - int dst_y_start, - int dst_y_end, - int dst_x_start, - int dst_x_end, - uint8_t *src_image, - int src_height, - int src_width, - int src_channel, - int src_y_start, - int src_y_end, - int src_x_start, - int src_x_end, - resize_type_t resize_type, - bool rgb_swap); -template void crop_and_resize(int16_t *dst_image, - int dst_width, - int dst_channel, - int dst_y_start, - int dst_y_end, - int dst_x_start, - int dst_x_end, - uint8_t *src_image, - int src_height, - int src_width, - int src_channel, - int src_y_start, - int src_y_end, - int src_x_start, - int src_x_end, - resize_type_t resize_type, - bool rgb_swap); -template void crop_and_resize(int8_t *dst_image, - int dst_width, - int dst_channel, - int dst_y_start, - int dst_y_end, - int dst_x_start, - int dst_x_end, - uint8_t *src_image, - int src_height, - int src_width, - int src_channel, - int src_y_start, - int src_y_end, - int src_x_start, - int src_x_end, - resize_type_t resize_type, - bool rgb_swap); - -void draw_filled_rectangle(uint8_t *image, - const uint32_t image_height, - const uint32_t image_width, - uint32_t x1, - uint32_t y1, - uint32_t x2, - uint32_t y2, - const uint32_t color) -{ - assert(x2 >= x1); - assert(y2 >= y1); - - x1 = DL_MIN(x1, image_width - 1); - y1 = DL_MIN(y1, image_height - 1); - x2 = DL_MIN(x2, image_width - 1); - y2 = DL_MIN(y2, image_height - 1); - - uint8_t c0 = color >> 16; - uint8_t c1 = color >> 8; - uint8_t c2 = color; - - uint8_t *ptr = image + (y1 * image_width + x1) * 3; - uint32_t offset = image_width * 3; - for (int y = y1; y <= y2; y++) { - uint8_t *row = ptr; - for (int x = x1; x <= x2; x++) { - row[0] = c0; - row[1] = c1; - row[2] = c2; - row += 3; - } - ptr += offset; - } -} - -void draw_filled_rectangle(uint16_t *image, - const uint32_t image_height, - const uint32_t image_width, - uint32_t x1, - uint32_t y1, - uint32_t x2, - uint32_t y2, - const uint16_t color) -{ - assert(x2 >= x1); - assert(y2 >= y1); - - x1 = DL_MIN(x1, image_width - 1); - y1 = DL_MIN(y1, image_height - 1); - x2 = DL_MIN(x2, image_width - 1); - y2 = DL_MIN(y2, image_height - 1); - - uint16_t *ptr = image + y1 * image_width + x1; - for (int y = y1; y <= y2; y++) { - uint16_t *row = ptr; - - for (int x = x1; x <= x2; x++) { - *row = color; - row++; - } - ptr += image_width; - } -} - -void draw_point(uint8_t *image, - const uint32_t image_height, - const uint32_t image_width, - const uint32_t x, - const uint32_t y, - const uint32_t size, - const uint32_t color) -{ - int half_size = size >> 1; - draw_filled_rectangle(image, - image_height, - image_width, - DL_MAX((int)x - half_size, 0), - DL_MAX((int)y - half_size, 0), - x + half_size, - y + half_size, - color); -} - -void draw_point(uint16_t *image, - const uint32_t image_height, - const uint32_t image_width, - const uint32_t x, - const uint32_t y, - const uint32_t size, - uint16_t color) -{ - int half_size = size >> 1; - draw_filled_rectangle(image, - image_height, - image_width, - DL_MAX((int)x - half_size, 0), - DL_MAX((int)y - half_size, 0), - x + half_size, - y + half_size, - color); -} - -void draw_hollow_rectangle(uint8_t *image, - const uint32_t image_height, - const uint32_t image_width, - uint32_t x1, - uint32_t y1, - uint32_t x2, - uint32_t y2, - uint32_t color) -{ - assert(x2 >= x1); - assert(y2 >= y1); - - x1 = DL_MIN(x1, image_width - 1); - y1 = DL_MIN(y1, image_height - 1); - x2 = DL_MIN(x2, image_width - 1); - y2 = DL_MIN(y2, image_height - 1); - - uint8_t c0 = color >> 16; - uint8_t c1 = color >> 8; - uint8_t c2 = color; - - // draw horizon - uint8_t *row_up = image + (y1 * image_width + x1) * 3; - uint8_t *row_down = image + (y2 * image_width + x1) * 3; - for (int x = x1; x <= x2; x++) { - row_up[0] = c0; - row_up[1] = c1; - row_up[2] = c2; - row_up += 3; - - row_down[0] = c0; - row_down[1] = c1; - row_down[2] = c2; - row_down += 3; - } - - // draw vertical - uint8_t *colum_left = image + (y1 * image_width + x1) * 3; - uint8_t *colum_right = image + (y1 * image_width + x2) * 3; - uint32_t offset = image_width * 3; - for (int y = y1; y <= y2; y++) { - colum_left[0] = c0; - colum_left[1] = c1; - colum_left[2] = c2; - colum_left += offset; - - colum_right[0] = c0; - colum_right[1] = c1; - colum_right[2] = c2; - colum_right += offset; - } -} - -void draw_hollow_rectangle(uint16_t *image, - const uint32_t image_height, - const uint32_t image_width, - uint32_t x1, - uint32_t y1, - uint32_t x2, - uint32_t y2, - const uint16_t color) -{ - assert(x2 >= x1); - assert(y2 >= y1); - - x1 = DL_MIN(x1, image_width - 1); - y1 = DL_MIN(y1, image_height - 1); - x2 = DL_MIN(x2, image_width - 1); - y2 = DL_MIN(y2, image_height - 1); - - // draw horizon - uint16_t *row_up = image + y1 * image_width + x1; - uint16_t *row_down = image + y2 * image_width + x1; - for (int x = x1; x <= x2; x++) { - *row_up = color; - row_up++; - - *row_down = color; - row_down++; - } - - // draw vertical - uint16_t *colum_left = image + y1 * image_width + x1; - uint16_t *colum_right = image + y1 * image_width + x2; - for (int y = y1; y <= y2; y++) { - *colum_left = color; - colum_left += image_width; - - *colum_right = color; - colum_right += image_width; - } -} - -uint32_t get_moving_point_number(uint16_t *f1, - uint16_t *f2, - const uint32_t height, - const uint32_t width, - const uint32_t stride, - const uint32_t threshold) -{ - uint32_t stride_y_offset = width * stride; - uint32_t count = 0; - for (uint32_t y = 0; y < height; y += stride) { - uint16_t *f1_row = f1; - uint16_t *f2_row = f2; - for (uint32_t x = 0; x < width; x += stride) { - int f1_gray = convert_pixel_rgb565_to_gray(*f1_row); - int f2_gray = convert_pixel_rgb565_to_gray(*f2_row); - - if (DL_ABS(f1_gray - f2_gray) > threshold) - count++; - - f1_row += stride; - f2_row += stride; - } - f1 += stride_y_offset; - f2 += stride_y_offset; - } - return count; -} - -uint32_t get_moving_point_number(uint8_t *f1, - uint8_t *f2, - const uint32_t height, - const uint32_t width, - const uint32_t stride, - const uint32_t threshold) -{ - uint32_t stride_y_offset = width * stride * 3; - uint32_t stride_x_offset = stride * 3; - uint32_t count = 0; - for (uint32_t y = 0; y < height; y += stride) { - uint8_t *f1_row = f1; - uint8_t *f2_row = f2; - for (uint32_t x = 0; x < width; x += stride) { - int f1_gray = convert_pixel_rgb888_to_gray(f1_row[2], f1_row[1], f1_row[0]); - int f2_gray = convert_pixel_rgb888_to_gray(f2_row[2], f2_row[1], f2_row[0]); - - if (DL_ABS(f1_gray - f2_gray) > threshold) - count++; - - f1_row += stride_x_offset; - f2_row += stride_x_offset; - } - f1 += stride_y_offset; - f2 += stride_y_offset; - } - return count; -} - -template -void warp_affine(uint8_t *input, - const std::vector &input_shape, - T *output, - const std::vector &output_shape, - dl::math::Matrix *M_inv, - bool byte_swap) -{ - int input_stride = input_shape[1] * input_shape[2]; // stride = w * c - int c = input_shape[2]; - int output_h = output_shape[0]; - int output_w = output_shape[1]; - - float x_src = 0.0; - float y_src = 0.0; - int x1 = 0; - int x2 = 0; - int y1 = 0; - int y2 = 0; - - for (int i = 0; i < output_h; i++) { - for (int j = 0; j < output_w; j++) { - x_src = (M_inv->array[0][0] * j + M_inv->array[0][1] * i + M_inv->array[0][2]) / - (M_inv->array[2][0] * j + M_inv->array[2][1] * i + M_inv->array[2][2]); - y_src = (M_inv->array[1][0] * j + M_inv->array[1][1] * i + M_inv->array[1][2]) / - (M_inv->array[2][0] * j + M_inv->array[2][1] * i + M_inv->array[2][2]); - if ((x_src < 0) || (y_src < 0) || (x_src >= (input_shape[1] - 1)) || (y_src >= (input_shape[0] - 1))) { - for (int k = 0; k < c; k++) { - *output++ = 0; - } - } else { - x1 = floor(x_src); - x2 = x1 + 1; - y1 = floor(y_src); - y2 = y1 + 1; - for (int k = 0; k < c; k++) { - *output++ = (T)rintf(((input[y1 * input_stride + x1 * c + k]) * (x2 - x_src) * (y2 - y_src)) + - ((input[y1 * input_stride + x2 * c + k]) * (x_src - x1) * (y2 - y_src)) + - ((input[y2 * input_stride + x1 * c + k]) * (x2 - x_src) * (y_src - y1)) + - ((input[y2 * input_stride + x2 * c + k]) * (x_src - x1) * (y_src - y1))); - } - } - } - } -} -template void warp_affine(uint8_t *input, - const std::vector &input_shape, - uint8_t *output, - const std::vector &output_shape, - dl::math::Matrix *M_inv, - bool byte_swap); -template void warp_affine(uint8_t *input, - const std::vector &input_shape, - int16_t *output, - const std::vector &output_shape, - dl::math::Matrix *M_inv, - bool byte_swap); - -template -void warp_affine(uint16_t *input, - const std::vector &input_shape, - T *output, - const std::vector &output_shape, - dl::math::Matrix *M_inv, - bool byte_swap) -{ - int input_stride = input_shape[1]; // stride = w - int c = input_shape[2]; - assert(c == 3); - int output_h = output_shape[0]; - int output_w = output_shape[1]; - - float x_src = 0.0; - float y_src = 0.0; - int x1 = 0; - int x2 = 0; - int y1 = 0; - int y2 = 0; - - uint8_t src_x1y1[3] = {0}; - uint8_t src_x1y2[3] = {0}; - uint8_t src_x2y1[3] = {0}; - uint8_t src_x2y2[3] = {0}; - - for (int i = 0; i < output_h; i++) { - for (int j = 0; j < output_w; j++) { - x_src = (M_inv->array[0][0] * j + M_inv->array[0][1] * i + M_inv->array[0][2]) / - (M_inv->array[2][0] * j + M_inv->array[2][1] * i + M_inv->array[2][2]); - y_src = (M_inv->array[1][0] * j + M_inv->array[1][1] * i + M_inv->array[1][2]) / - (M_inv->array[2][0] * j + M_inv->array[2][1] * i + M_inv->array[2][2]); - if ((x_src < 0) || (y_src < 0) || (x_src >= (input_shape[1] - 1)) || (y_src >= (input_shape[0] - 1))) { - for (int k = 0; k < c; k++) { - *output++ = 0; - } - } else { - x1 = floor(x_src); - x2 = x1 + 1; - y1 = floor(y_src); - y2 = y1 + 1; - - dl::image::convert_pixel_rgb565_to_rgb888(input[y1 * input_stride + x1], src_x1y1); - dl::image::convert_pixel_rgb565_to_rgb888(input[y2 * input_stride + x1], src_x1y2); - dl::image::convert_pixel_rgb565_to_rgb888(input[y1 * input_stride + x2], src_x2y1); - dl::image::convert_pixel_rgb565_to_rgb888(input[y2 * input_stride + x2], src_x2y2); - - *output++ = - (T)rintf((src_x1y1[0] * (x2 - x_src) * (y2 - y_src)) + (src_x2y1[0] * (x_src - x1) * (y2 - y_src)) + - (src_x1y2[0] * (x2 - x_src) * (y_src - y1)) + (src_x2y2[0] * (x_src - x1) * (y_src - y1))); - *output++ = - (T)rintf((src_x1y1[1] * (x2 - x_src) * (y2 - y_src)) + (src_x2y1[1] * (x_src - x1) * (y2 - y_src)) + - (src_x1y2[1] * (x2 - x_src) * (y_src - y1)) + (src_x2y2[1] * (x_src - x1) * (y_src - y1))); - *output++ = - (T)rintf((src_x1y1[2] * (x2 - x_src) * (y2 - y_src)) + (src_x2y1[2] * (x_src - x1) * (y2 - y_src)) + - (src_x1y2[2] * (x2 - x_src) * (y_src - y1)) + (src_x2y2[2] * (x_src - x1) * (y_src - y1))); - } - } - } -} -template void warp_affine(uint16_t *input, - const std::vector &input_shape, - uint8_t *output, - const std::vector &output_shape, - dl::math::Matrix *M_inv, - bool byte_swap); -template void warp_affine(uint16_t *input, - const std::vector &input_shape, - int16_t *output, - const std::vector &output_shape, - dl::math::Matrix *M_inv, - bool byte_swap); - -uint8_t get_otsu_thresh(Tensor &image) -{ - if (image.shape.size() == 3) { - assert(image.shape[2] == 1); - } else { - assert(image.shape.size() == 2); - } - int numPixels = image.get_size(); - - const int HISTOGRAM_SIZE = 256; - unsigned int histogram[HISTOGRAM_SIZE]; - memset(histogram, 0, (HISTOGRAM_SIZE) * sizeof(unsigned int)); - uint8_t *ptr = image.element; - int length = numPixels; - while (length--) { - uint8_t value = *ptr++; - histogram[value]++; - } - - int sum = 0; - for (int i = 0; i < HISTOGRAM_SIZE; ++i) { - sum += i * histogram[i]; - } - - int sumB = 0; - int q1 = 0; - double max = 0; - uint8_t threshold = 0; - for (int i = 0; i < HISTOGRAM_SIZE; ++i) { - q1 += histogram[i]; - if (q1 == 0) - continue; - - const int q2 = numPixels - q1; - if (q2 == 0) - break; - - sumB += i * histogram[i]; - const double m1 = (double)sumB / q1; - const double m2 = ((double)sum - sumB) / q2; - const double m1m2 = m1 - m2; - const double variance = m1m2 * m1m2 * q1 * q2; - if (variance > max) { - threshold = i; - max = variance; - } - } - - return threshold; -} - -Tensor *rgb2gray(Tensor &image, bool bgr) -{ - assert(image.shape.size() == 3); - assert(image.shape[2] == 3); - - Tensor *gray = new Tensor; - gray->set_shape({image.shape[0], image.shape[1], 1}).malloc_element(); - int count = gray->get_size(); - uint8_t *r = NULL; - uint8_t *g = NULL; - uint8_t *b = NULL; - if (bgr) { - b = image.element; - g = b + 1; - r = b + 2; - } else { - r = image.element; - g = r + 1; - b = r + 2; - } - - uint8_t *pgray = gray->element; - int x = 0; - for (int i = 0; i < count; ++i) { - // TODO: use tie instructions. - x = (19595 * (*r) + 38469 * (*g) + 7472 * (*b)) >> 16; // fast algorithm - // Gray = R*0.299 + G*0.587 + B*0.114 - // Gray = (R*30 + G*59 + B*11 + 50) / 100 - // Gray = (R*38 + G*75 + B*15) >> 7 - - *(pgray++) = (uint8_t)x; - - r += 3; - g += 3; - b += 3; - } - return gray; -} - -Tensor *rgb2lab(Tensor &image, bool bgr, bool fast) -{ - assert(image.shape.size() == 3); - assert(image.shape[2] == 3); - - Tensor *lab = new Tensor; - lab->set_shape(image.shape).malloc_element(); - int count = image.shape[0] * image.shape[1]; - uint8_t *r = NULL; - uint8_t *g = NULL; - uint8_t *b = NULL; - if (bgr) { - b = image.element; - g = b + 1; - r = b + 2; - } else { - r = image.element; - g = r + 1; - b = r + 2; - } - - if (fast) { - int x, y, z; - uint8_t *plab = lab->element; - for (int i = 0; i < count; ++i) { - // TODO: use tie - x = (13933 * (*r) + 46871 * (*g) + 4732 * (*b)) >> 16; - y = (((5467631 * (*r) - 8376186 * (*g) + 2908178 * (*b))) >> 24) + 128; - z = (((2043680 * (*r) + 6351200 * (*g) - 8394880 * (*b))) >> 24) + 128; - - *(plab++) = (uint8_t)x; - *(plab++) = (uint8_t)y; - *(plab++) = (uint8_t)z; - - r += 3; - g += 3; - b += 3; - } - return lab; - } else { - float x, y, z; - uint8_t *plab = lab->element; - for (int i = 0; i < count; ++i) { - x = (0.433953 * (*r) + 0.376219 * (*g) + 0.189828 * (*b)) / 255; - y = (0.212671 * (*r) + 0.715160 * (*g) + 0.072169 * (*b)) / 255; - z = (0.017758 * (*r) + 0.109476 * (*g) + 0.872766 * (*b)) / 255; - - x = (x > 0.008856) ? pow(x, 1.0 / 3) : (7.787037 * x + 0.137931); - y = (y > 0.008856) ? pow(y, 1.0 / 3) : (7.787037 * y + 0.137931); - z = (z > 0.008856) ? pow(z, 1.0 / 3) : (7.787037 * z + 0.137931); - - *(plab++) = (uint8_t)(116 * y - 16); - *(plab++) = (uint8_t)(500 * (x - y) + 128); - *(plab++) = (uint8_t)(200 * (y - z) + 128); - - r += 3; - g += 3; - b += 3; - } - return lab; - } -} - -Tensor *rgb2hsv(Tensor &image, bool bgr, bool fast) -{ - assert(image.shape.size() == 3); - assert(image.shape[2] == 3); - - Tensor *hsv = new Tensor; - hsv->set_shape(image.shape).malloc_element(); - int count = image.shape[0] * image.shape[1]; - uint8_t *r = NULL; - uint8_t *g = NULL; - uint8_t *b = NULL; - if (bgr) { - b = image.element; - g = b + 1; - r = b + 2; - } else { - r = image.element; - g = r + 1; - b = r + 2; - } - if (fast) { - int h, s, v, min_rgb, delta; - uint8_t *phsv = hsv->element; - for (int i = 0; i < count; ++i) { - v = DL_MAX(DL_MAX(*r, *g), *b); - min_rgb = DL_MIN(DL_MIN(*r, *g), *b); - if (v == min_rgb) { - *(phsv++) = 0; - *(phsv++) = 0; - *(phsv++) = (uint8_t)(v); - } else { - delta = v - min_rgb; - s = (delta * 255) / v; - if (v == (*r)) { - h = (60 * ((*g) - (*b)) / (delta)) >> 1; - h = (h < 0) ? (h + 180) : h; - } else if (v == (*g)) { - h = (120 + 60 * ((*b) - (*r)) / delta) >> 1; - } else { - h = (240 + 60 * ((*r) - (*g)) / delta) >> 1; - } - *(phsv++) = (uint8_t)(h); - *(phsv++) = (uint8_t)(s); - *(phsv++) = (uint8_t)(v); - } - - r += 3; - g += 3; - b += 3; - } - - return hsv; - } else { - float h, s, v, min_rgb; - uint8_t *phsv = hsv->element; - float h_scale = 180.0 / 360.0; - for (int i = 0; i < count; ++i) { - v = DL_MAX(DL_MAX(*r, *g), *b); - min_rgb = DL_MIN(DL_MIN(*r, *g), *b); - if (v == min_rgb) { - *(phsv++) = 0; - *(phsv++) = 0; - *(phsv++) = (uint8_t)(v); - } else { - s = (v - min_rgb) * 255.0 / v; - if (v == (*r)) { - h = h_scale * 60.0 * ((*g) - (*b)) / (v - min_rgb); - h = (h < 0) ? (h + 180) : h; - } else if (v == (*g)) { - h = h_scale * (120.0 + 60.0 * ((*b) - (*r)) / (v - min_rgb)); - } else { - h = h_scale * (240.0 + 60.0 * ((*r) - (*g)) / (v - min_rgb)); - } - *(phsv++) = (uint8_t)(h); - *(phsv++) = (uint8_t)(s); - *(phsv++) = (uint8_t)(v); - } - - r += 3; - g += 3; - b += 3; - } - - return hsv; - } -} - -Tensor *convert_image_rgb565_to_rgb888(uint16_t *image, std::vector &image_shape) -{ - Tensor *rgb = new Tensor; - rgb->set_shape({image_shape[0], image_shape[1], 3}).malloc_element(); - int count = image_shape[0] * image_shape[1]; - uint8_t *element_ptr = rgb->element; - for (int i = 0; i < count; ++i) { - convert_pixel_rgb565_to_rgb888(image[i], element_ptr); - element_ptr += 3; - } - return rgb; -} - -Tensor *gen_binary_img(Tensor &image, std::vector thresh) -{ - assert(image.shape.size() == 3); - assert(image.shape[2] == 3); - assert(thresh.size() == 6); - Tensor *bin = new Tensor; - bin->set_shape({image.shape[0], image.shape[1], 1}).malloc_element(); - uint8_t *c1 = image.element; - uint8_t *c2 = c1 + 1; - uint8_t *c3 = c1 + 2; - uint8_t *pbin = bin->element; - int count = bin->get_size(); - // int num = 0; - for (int i = 0; i < count; i++) { - if (((*c1) >= thresh[0]) && ((*c1) <= thresh[1]) && ((*c2) >= thresh[2]) && ((*c2) <= thresh[3]) && - ((*c3) >= thresh[4]) && ((*c3) <= thresh[5])) { - *(pbin++) = 255; - // num++; - } else { - *(pbin++) = 0; - } - c1 += 3; - c2 += 3; - c3 += 3; - } - - return bin; -} - -Tensor *resize_image(Tensor &image, std::vector target_shape, resize_type_t resize_type) -{ - assert(image.shape.size() == 3); - assert(target_shape.size() == 3); - Tensor *resized_image = new Tensor; - resized_image->set_shape({target_shape[0], target_shape[1], image.shape[2]}); - float h_ratio = (float)(image.shape[0]) / target_shape[0]; - float w_ratio = (float)(image.shape[1]) / target_shape[1]; - - if (image.shape.back() == 3) { - resized_image->malloc_element(); - uint8_t *resized_ptr = resized_image->element; - float h_origin = 0; - float w_origin = 0; - if (resize_type == IMAGE_RESIZE_BILINEAR) { - for (int h = 0; h < target_shape[0]; ++h) { - h_origin = h * h_ratio; - int h1 = (int)h_origin; - int h2 = h1 + 1; - float h1_weight = (float)h2 - h_origin; - float h2_weight = h_origin - (float)h1; - for (int w = 0; w < target_shape[1]; ++w) { - w_origin = w * w_ratio; - int w1 = (int)w_origin; - int w2 = w1 + 1; - float w1_weight = (float)w2 - w_origin; - float w2_weight = w_origin - (float)w1; - resized_ptr[0] = (uint8_t)(image.get_element_value({h1, w1, 0}) * h1_weight * w1_weight + - image.get_element_value({h1, w2, 0}) * h1_weight * w2_weight + - image.get_element_value({h2, w1, 0}) * h2_weight * w1_weight + - image.get_element_value({h2, w2, 0}) * h2_weight * w2_weight); - resized_ptr[1] = (uint8_t)(image.get_element_value({h1, w1, 1}) * h1_weight * w1_weight + - image.get_element_value({h1, w2, 1}) * h1_weight * w2_weight + - image.get_element_value({h2, w1, 1}) * h2_weight * w1_weight + - image.get_element_value({h2, w2, 1}) * h2_weight * w2_weight); - resized_ptr[2] = (uint8_t)(image.get_element_value({h1, w1, 2}) * h1_weight * w1_weight + - image.get_element_value({h1, w2, 2}) * h1_weight * w2_weight + - image.get_element_value({h2, w1, 2}) * h2_weight * w1_weight + - image.get_element_value({h2, w2, 2}) * h2_weight * w2_weight); - resized_ptr += 3; - } - } - return resized_image; - } else if (resize_type == IMAGE_RESIZE_MEAN) { - for (int h = 0; h < target_shape[0]; ++h) { - h_origin = h * h_ratio; - int h1 = (int)h_origin; - int h2 = h1 + 1; - for (int w = 0; w < target_shape[1]; ++w) { - w_origin = w * w_ratio; - int w1 = (int)w_origin; - int w2 = w1 + 1; - resized_ptr[0] = (uint8_t)(((int)image.get_element_value({h1, w1, 0}) + - (int)image.get_element_value({h1, w2, 0}) + - (int)image.get_element_value({h2, w1, 0}) + - (int)image.get_element_value({h2, w2, 0})) >> - 2); - resized_ptr[1] = (uint8_t)(((int)image.get_element_value({h1, w1, 1}) + - (int)image.get_element_value({h1, w2, 1}) + - (int)image.get_element_value({h2, w1, 1}) + - (int)image.get_element_value({h2, w2, 1})) >> - 2); - resized_ptr[2] = (uint8_t)(((int)image.get_element_value({h1, w1, 2}) + - (int)image.get_element_value({h1, w2, 2}) + - (int)image.get_element_value({h2, w1, 2}) + - (int)image.get_element_value({h2, w2, 2})) >> - 2); - resized_ptr += 3; - } - } - return resized_image; - } else if (resize_type == IMAGE_RESIZE_NEAREST) { - for (int h = 0; h < target_shape[0]; ++h) { - h_origin = h * h_ratio; - int h_real = (int)(round(h_origin)); - for (int w = 0; w < target_shape[1]; ++w) { - w_origin = w * w_ratio; - int w_real = (int)(round(w_origin)); - resized_ptr[0] = image.get_element_value({h_real, w_real, 0}); - resized_ptr[1] = image.get_element_value({h_real, w_real, 1}); - resized_ptr[2] = image.get_element_value({h_real, w_real, 2}); - resized_ptr += 3; - } - } - return resized_image; - } else { - delete resized_image; - ESP_LOGE("resize image", "resize type is not supported!"); - return NULL; - } - } else if (image.shape.back() == 1) { - resized_image->malloc_element(); - uint8_t *resized_ptr = resized_image->element; - float h_origin = 0; - float w_origin = 0; - if (resize_type == IMAGE_RESIZE_BILINEAR) { - for (int h = 0; h < target_shape[0]; ++h) { - h_origin = h * h_ratio; - int h1 = (int)h_origin; - int h2 = h1 + 1; - float h1_weight = (float)h2 - h_origin; - float h2_weight = h_origin - (float)h1; - for (int w = 0; w < target_shape[1]; ++w) { - w_origin = w * w_ratio; - int w1 = (int)w_origin; - int w2 = w1 + 1; - float w1_weight = (float)w2 - w_origin; - float w2_weight = w_origin - (float)w1; - resized_ptr[0] = (uint8_t)(image.get_element_value({h1, w1, 0}) * h1_weight * w1_weight + - image.get_element_value({h1, w2, 0}) * h1_weight * w2_weight + - image.get_element_value({h2, w1, 0}) * h2_weight * w1_weight + - image.get_element_value({h2, w2, 0}) * h2_weight * w2_weight); - ++resized_ptr; - } - } - return resized_image; - } else if (resize_type == IMAGE_RESIZE_MEAN) { - for (int h = 0; h < target_shape[0]; ++h) { - h_origin = h * h_ratio; - int h1 = (int)h_origin; - int h2 = h1 + 1; - for (int w = 0; w < target_shape[1]; ++w) { - w_origin = w * w_ratio; - int w1 = (int)w_origin; - int w2 = w1 + 1; - resized_ptr[0] = (uint8_t)(((int)image.get_element_value({h1, w1, 0}) + - (int)image.get_element_value({h1, w2, 0}) + - (int)image.get_element_value({h2, w1, 0}) + - (int)image.get_element_value({h2, w2, 0})) >> - 2); - ++resized_ptr; - } - } - return resized_image; - } else if (resize_type == IMAGE_RESIZE_NEAREST) { - for (int h = 0; h < target_shape[0]; ++h) { - h_origin = h * h_ratio; - int h_real = (int)(round(h_origin)); - for (int w = 0; w < target_shape[1]; ++w) { - w_origin = w * w_ratio; - int w_real = (int)(round(w_origin)); - resized_ptr[0] = image.get_element_value({h_real, w_real, 0}); - ++resized_ptr; - } - } - return resized_image; - } else { - delete resized_image; - ESP_LOGE("resize image", "resize type is not supported!"); - return NULL; - } - } else { - delete resized_image; - ESP_LOGE("resize image", "the image shape is invaild!"); - return NULL; - } -} - -void resize_image(Tensor &image, Tensor &resized_image, resize_type_t resize_type) -{ - assert(image.shape.size() == 3); - assert(resized_image.shape.size() == 3); - float h_ratio = (float)(image.shape[0]) / resized_image.shape[0]; - float w_ratio = (float)(image.shape[1]) / resized_image.shape[1]; - - if (image.shape.back() == 3) { - resized_image.malloc_element(); - uint8_t *resized_ptr = resized_image.element; - float h_origin = 0; - float w_origin = 0; - if (resize_type == IMAGE_RESIZE_BILINEAR) { - for (int h = 0; h < resized_image.shape[0]; ++h) { - h_origin = h * h_ratio; - int h1 = (int)h_origin; - int h2 = h1 + 1; - float h1_weight = (float)h2 - h_origin; - float h2_weight = h_origin - (float)h1; - for (int w = 0; w < resized_image.shape[1]; ++w) { - w_origin = w * w_ratio; - int w1 = (int)w_origin; - int w2 = w1 + 1; - float w1_weight = (float)w2 - w_origin; - float w2_weight = w_origin - (float)w1; - resized_ptr[0] = (uint8_t)(image.get_element_value({h1, w1, 0}) * h1_weight * w1_weight + - image.get_element_value({h1, w2, 0}) * h1_weight * w2_weight + - image.get_element_value({h2, w1, 0}) * h2_weight * w1_weight + - image.get_element_value({h2, w2, 0}) * h2_weight * w2_weight); - resized_ptr[1] = (uint8_t)(image.get_element_value({h1, w1, 1}) * h1_weight * w1_weight + - image.get_element_value({h1, w2, 1}) * h1_weight * w2_weight + - image.get_element_value({h2, w1, 1}) * h2_weight * w1_weight + - image.get_element_value({h2, w2, 1}) * h2_weight * w2_weight); - resized_ptr[2] = (uint8_t)(image.get_element_value({h1, w1, 2}) * h1_weight * w1_weight + - image.get_element_value({h1, w2, 2}) * h1_weight * w2_weight + - image.get_element_value({h2, w1, 2}) * h2_weight * w1_weight + - image.get_element_value({h2, w2, 2}) * h2_weight * w2_weight); - resized_ptr += 3; - } - } - return; - } else if (resize_type == IMAGE_RESIZE_MEAN) { - for (int h = 0; h < resized_image.shape[0]; ++h) { - h_origin = h * h_ratio; - int h1 = (int)h_origin; - int h2 = h1 + 1; - for (int w = 0; w < resized_image.shape[1]; ++w) { - w_origin = w * w_ratio; - int w1 = (int)w_origin; - int w2 = w1 + 1; - resized_ptr[0] = (uint8_t)(((int)image.get_element_value({h1, w1, 0}) + - (int)image.get_element_value({h1, w2, 0}) + - (int)image.get_element_value({h2, w1, 0}) + - (int)image.get_element_value({h2, w2, 0})) >> - 2); - resized_ptr[1] = (uint8_t)(((int)image.get_element_value({h1, w1, 1}) + - (int)image.get_element_value({h1, w2, 1}) + - (int)image.get_element_value({h2, w1, 1}) + - (int)image.get_element_value({h2, w2, 1})) >> - 2); - resized_ptr[2] = (uint8_t)(((int)image.get_element_value({h1, w1, 2}) + - (int)image.get_element_value({h1, w2, 2}) + - (int)image.get_element_value({h2, w1, 2}) + - (int)image.get_element_value({h2, w2, 2})) >> - 2); - resized_ptr += 3; - } - } - return; - } else if (resize_type == IMAGE_RESIZE_NEAREST) { - for (int h = 0; h < resized_image.shape[0]; ++h) { - h_origin = h * h_ratio; - int h_real = (int)(round(h_origin)); - for (int w = 0; w < resized_image.shape[1]; ++w) { - w_origin = w * w_ratio; - int w_real = (int)(round(w_origin)); - resized_ptr[0] = image.get_element_value({h_real, w_real, 0}); - resized_ptr[1] = image.get_element_value({h_real, w_real, 1}); - resized_ptr[2] = image.get_element_value({h_real, w_real, 2}); - resized_ptr += 3; - } - } - return; - } else { - ESP_LOGE("resize image", "resize type is not supported!"); - return; - } - } else if (image.shape.back() == 1) { - resized_image.malloc_element(); - uint8_t *resized_ptr = resized_image.element; - float h_origin = 0; - float w_origin = 0; - if (resize_type == IMAGE_RESIZE_BILINEAR) { - for (int h = 0; h < resized_image.shape[0]; ++h) { - h_origin = h * h_ratio; - int h1 = (int)h_origin; - int h2 = h1 + 1; - float h1_weight = (float)h2 - h_origin; - float h2_weight = h_origin - (float)h1; - for (int w = 0; w < resized_image.shape[1]; ++w) { - w_origin = w * w_ratio; - int w1 = (int)w_origin; - int w2 = w1 + 1; - float w1_weight = (float)w2 - w_origin; - float w2_weight = w_origin - (float)w1; - resized_ptr[0] = (uint8_t)(image.get_element_value({h1, w1, 0}) * h1_weight * w1_weight + - image.get_element_value({h1, w2, 0}) * h1_weight * w2_weight + - image.get_element_value({h2, w1, 0}) * h2_weight * w1_weight + - image.get_element_value({h2, w2, 0}) * h2_weight * w2_weight); - ++resized_ptr; - } - } - return; - } else if (resize_type == IMAGE_RESIZE_MEAN) { - for (int h = 0; h < resized_image.shape[0]; ++h) { - h_origin = h * h_ratio; - int h1 = (int)h_origin; - int h2 = h1 + 1; - for (int w = 0; w < resized_image.shape[1]; ++w) { - w_origin = w * w_ratio; - int w1 = (int)w_origin; - int w2 = w1 + 1; - resized_ptr[0] = (uint8_t)(((int)image.get_element_value({h1, w1, 0}) + - (int)image.get_element_value({h1, w2, 0}) + - (int)image.get_element_value({h2, w1, 0}) + - (int)image.get_element_value({h2, w2, 0})) >> - 2); - ++resized_ptr; - } - } - return; - } else if (resize_type == IMAGE_RESIZE_NEAREST) { - for (int h = 0; h < resized_image.shape[0]; ++h) { - h_origin = h * h_ratio; - int h_real = (int)(round(h_origin)); - for (int w = 0; w < resized_image.shape[1]; ++w) { - w_origin = w * w_ratio; - int w_real = (int)(round(w_origin)); - resized_ptr[0] = image.get_element_value({h_real, w_real, 0}); - ++resized_ptr; - } - } - return; - } else { - ESP_LOGE("resize image", "resize type is not supported!"); - return; - } - } else { - ESP_LOGE("resize image", "the image shape is invaild!"); - return; - } -} - -template -T *resize_image_nearest(T *image, std::vector input_shape, std::vector target_shape) -{ - assert(input_shape.size() == 3); - assert(target_shape.size() == 3); - T *resized_image = (T *)dl::tool::malloc_aligned( - target_shape[0] * target_shape[1] * target_shape[2], sizeof(T), 16, MALLOC_CAP_8BIT); - float h_ratio = (float)(input_shape[0]) / target_shape[0]; - float w_ratio = (float)(input_shape[1]) / target_shape[1]; - - if (input_shape.back() == 3) { - T *resized_ptr = resized_image; - float h_origin = 0; - float w_origin = 0; - - for (int h = 0; h < target_shape[0]; ++h) { - h_origin = h * h_ratio; - int h_real = (int)(round(h_origin)); - for (int w = 0; w < target_shape[1]; ++w) { - w_origin = w * w_ratio; - int w_real = (int)(round(w_origin)); - T *origin_ptr = image + (h_real * input_shape[1] + w_real * 3); - - resized_ptr[0] = origin_ptr[0]; - resized_ptr[1] = origin_ptr[1]; - resized_ptr[2] = origin_ptr[2]; - resized_ptr += 3; - } - } - return resized_image; - } else if (input_shape.back() == 1) { - T *resized_ptr = resized_image; - float h_origin = 0; - float w_origin = 0; - - for (int h = 0; h < target_shape[0]; ++h) { - h_origin = h * h_ratio; - int h_real = (int)(round(h_origin)); - for (int w = 0; w < target_shape[1]; ++w) { - w_origin = w * w_ratio; - int w_real = (int)(round(w_origin)); - resized_ptr[0] = *(image + (h_real * input_shape[1] + w_real)); - ++resized_ptr; - } - } - return resized_image; - } else { - dl::tool::free_aligned(resized_image); - ESP_LOGE("resize image", "the image shape is invaild!"); - return NULL; - } -} -template int32_t *resize_image_nearest(int32_t *image, std::vector input_shape, std::vector target_shape); -template uint32_t *resize_image_nearest(uint32_t *image, std::vector input_shape, std::vector target_shape); -template int16_t *resize_image_nearest(int16_t *image, std::vector input_shape, std::vector target_shape); -template uint16_t *resize_image_nearest(uint16_t *image, std::vector input_shape, std::vector target_shape); -template int8_t *resize_image_nearest(int8_t *image, std::vector input_shape, std::vector target_shape); -template uint8_t *resize_image_nearest(uint8_t *image, std::vector input_shape, std::vector target_shape); -template float *resize_image_nearest(float *image, std::vector input_shape, std::vector target_shape); - -template -void resize_image_nearest(T *image, std::vector input_shape, T *resized_image, std::vector target_shape) -{ - assert(input_shape.size() == 3); - assert(target_shape.size() == 3); - float h_ratio = (float)(input_shape[0]) / target_shape[0]; - float w_ratio = (float)(input_shape[1]) / target_shape[1]; - - if (input_shape.back() == 3) { - T *resized_ptr = resized_image; - float h_origin = 0; - float w_origin = 0; - - for (int h = 0; h < target_shape[0]; ++h) { - h_origin = h * h_ratio; - int h_real = (int)(round(h_origin)); - for (int w = 0; w < target_shape[1]; ++w) { - w_origin = w * w_ratio; - int w_real = (int)(round(w_origin)); - T *origin_ptr = image + (h_real * input_shape[1] + w_real * 3); - - resized_ptr[0] = origin_ptr[0]; - resized_ptr[1] = origin_ptr[1]; - resized_ptr[2] = origin_ptr[2]; - resized_ptr += 3; - } - } - return; - } else if (input_shape.back() == 1) { - T *resized_ptr = resized_image; - float h_origin = 0; - float w_origin = 0; - - for (int h = 0; h < target_shape[0]; ++h) { - h_origin = h * h_ratio; - int h_real = (int)(round(h_origin)); - for (int w = 0; w < target_shape[1]; ++w) { - w_origin = w * w_ratio; - int w_real = (int)(round(w_origin)); - resized_ptr[0] = *(image + (h_real * input_shape[1] + w_real)); - ++resized_ptr; - } - } - return; - } else { - ESP_LOGE("resize image", "the image shape is invaild!"); - return; - } -} -template void resize_image_nearest(int32_t *image, - std::vector input_shape, - int32_t *resized_image, - std::vector target_shape); -template void resize_image_nearest(uint32_t *image, - std::vector input_shape, - uint32_t *resized_image, - std::vector target_shape); -template void resize_image_nearest(int16_t *image, - std::vector input_shape, - int16_t *resized_image, - std::vector target_shape); -template void resize_image_nearest(uint16_t *image, - std::vector input_shape, - uint16_t *resized_image, - std::vector target_shape); -template void resize_image_nearest(int8_t *image, - std::vector input_shape, - int8_t *resized_image, - std::vector target_shape); -template void resize_image_nearest(uint8_t *image, - std::vector input_shape, - uint8_t *resized_image, - std::vector target_shape); -template void resize_image_nearest(float *image, - std::vector input_shape, - float *resized_image, - std::vector target_shape); - -} // namespace image -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/image/dl_image.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/image/dl_image.hpp deleted file mode 100644 index 6128e96f..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/image/dl_image.hpp +++ /dev/null @@ -1,563 +0,0 @@ -#pragma once - -#include "dl_define.hpp" -#include "dl_math_matrix.hpp" -#include "dl_variable.hpp" -#include -#include -#include -#include - -namespace dl { -namespace image { -typedef enum { - IMAGE_RESIZE_BILINEAR = 0, /*> 7; - return DL_CLIP(temp, 0, 255); -} - -/** - * @brief Convert RGB565 pixel to RGB888. - * - * @tparam T supports all integer types - * @param input pixel value in RGB565 - * @param output pixel value in RGB888 - * @param byte_swap Whether to swap the input data in byte. - */ -template -inline void convert_pixel_rgb565_to_rgb888(uint16_t input, T *output, bool byte_swap = false) -{ - // gggbbbbb rrrrrggg - if (byte_swap) { - output[0] = (input & 0x1F00) >> 5; // blue - output[1] = ((input & 0x7) << 5) | ((input & 0xE000) >> 11); // green - output[2] = input & 0xF8; // red - } - // rrrrrggg gggbbbbb - else { - output[0] = input & 0x1F; // blue - output[1] = (input & 0x7E0) >> 3; // green - output[2] = (input & 0xF800) >> 8; // red - } -} - -/** - * @brief Convert RGB565 image to RGB888 image. - * - * @param image ptr of RGB565 image - * @param image_shape shape of the input image - * @return Tensor* output RGB88 image - */ -Tensor *convert_image_rgb565_to_rgb888(uint16_t *image, std::vector &image_shape); - -/** - * @brief Convert RGB565 pixel to Gray. - * - * @param input pixel value in RGB565 - * @param byte_swap Whether to swap the input data in byte. - * @return pixel value in Gray - */ -inline uint8_t convert_pixel_rgb565_to_gray(uint16_t input, bool byte_swap = false) -{ - int blue, green, red; - if (byte_swap) { - blue = (input & 0x1F00) >> 5; // blue - green = ((input & 0x7) << 5) | ((input & 0xE000) >> 11); // green - red = input & 0xF8; // red - } else { - blue = input & 0x1F; // blue - green = (input & 0x7E0) >> 3; // green - red = (input & 0xF800) >> 8; // red - } - return convert_pixel_rgb888_to_gray(red, green, blue); -} - -/** - * @brief Crop a patch from image and resize and store to destination image. - * If the cropping box is out of image, destination image will be padded with edge. - * - * The outer rectangle is the entire output image. - * The inner rectangle is where the resized image will be stored. - * In other world, this function could help you do padding while resize image. - * ___________________________(dst_w)__________________ - * | ___________________________ | - * | |(x_start, y_start) | | - * | | | | - * | | | | - * (dst_h)| | | | - * | | | | - * | | | | - * | |___________________________|(x_end, y_end) | - * |____________________________________________________| - * - * @tparam T suppot all integer types - * @param dst_image pointer of destination(output) image - * @param dst_width destination image width - * @param dst_channel destination image channel number - * @param dst_y_start start y of resized image in destination image - * @param dst_y_end end y of resized image in destination image - * @param dst_x_start start x of resized image in destination image - * @param dst_x_end end x of resized image in destination image - * @param src_image pointer of source image - * @param src_height source image height - * @param src_width source image width - * @param src_channel source image channel - * @param src_y_start start y of resized image in source image - * @param src_y_end end y of resized image in source image - * @param src_x_start start x of resized image in source image - * @param src_x_end end x of resized image in source image - * @param resize_type one of IMAGE_RESIZE_BILINEAR or IMAGE_RESIZE_MEAN or IMAGE_RESIZE_NEAREST - * @param rgb_swap Whether to swap the input data in RGB (e.g. RGB->BGR, BGR->RGB) - * @param byte_swap Whether to swap the input data in byte. - */ -template -void crop_and_resize(T *dst_image, - int dst_width, - int dst_channel, - int dst_y_start, - int dst_y_end, - int dst_x_start, - int dst_x_end, - uint16_t *src_image, - int src_height, - int src_width, - int src_channel, - int src_y_start, - int src_y_end, - int src_x_start, - int src_x_end, - resize_type_t resize_type = IMAGE_RESIZE_NEAREST, - bool rgb_swap = false, - bool byte_swap = false); - -/** - * @brief Crop a patch from image and resize and store to destination image. - * If the cropping box is out of image, destination image will be padded with edge. - * - * The outer rectangle is the entire output image. - * The inner rectangle is where the resized image will be stored. - * In other world, this function could help you do padding while resize image. - * ___________________________(dst_w)__________________ - * | ___________________________ | - * | |(x_start, y_start) | | - * | | | | - * | | | | - * (dst_h)| | | | - * | | | | - * | | | | - * | |___________________________|(x_end, y_end) | - * |____________________________________________________| - * - * @tparam T suppot all integer types - * @param dst_image pointer of destination(output) image - * @param dst_width destination image width - * @param dst_channel destination image channel number - * @param dst_y_start start y of resized image in destination image - * @param dst_y_end end y of resized image in destination image - * @param dst_x_start start x of resized image in destination image - * @param dst_x_end end x of resized image in destination image - * @param src_image pointer of source image - * @param src_height source image height - * @param src_width source image width - * @param src_channel source image channel - * @param src_y_start start y of resized image in source image - * @param src_y_end end y of resized image in source image - * @param src_x_start start x of resized image in source image - * @param src_x_end end x of resized image in source image - * @param resize_type one of IMAGE_RESIZE_BILINEAR or IMAGE_RESIZE_MEAN or IMAGE_RESIZE_NEAREST - * @param rgb_swap Whether to swap the input data in RGB (e.g. RGB->BGR, BGR->RGB) - */ -template -void crop_and_resize(T *dst_image, - int dst_width, - int dst_channel, - int dst_y_start, - int dst_y_end, - int dst_x_start, - int dst_x_end, - uint8_t *src_image, - int src_height, - int src_width, - int src_channel, - int src_y_start, - int src_y_end, - int src_x_start, - int src_x_end, - resize_type_t resize_type = IMAGE_RESIZE_NEAREST, - bool rgb_swap = false); - -/** - * @brief Draw a filled rectangle on RGB888 image. - * - * @param image pointer of input image - * @param image_height height of input image - * @param image_width width of input image - * @param x1 left up corner x - * @param y1 left up corner y - * @param x2 right bottom corner x - * @param y2 right bottom corner y - * @param color 0x 00| 00| 00| 00 - * reserved|channel 0|channel 1|channel 2 - */ -void draw_filled_rectangle(uint8_t *image, - const uint32_t image_height, - const uint32_t image_width, - uint32_t x1, - uint32_t y1, - uint32_t x2, - uint32_t y2, - const uint32_t color = 0x00FF0000); - -/** - * @brief Draw a filled rectangle on RGB565 image. - * - * @param image pointer of input image - * @param image_height height of input image - * @param image_width width of input image - * @param x1 left up corner x - * @param y1 left up corner y - * @param x2 right bottom corner x - * @param y2 right bottom corner y - * @param color 0b 000| 00000| 00000| 000 - * channel 1[2:0]|channel 0|channel 2|channel 1[5:3] - */ -void draw_filled_rectangle(uint16_t *image, - const uint32_t image_height, - const uint32_t image_width, - uint32_t x1, - uint32_t y1, - uint32_t x2, - uint32_t y2, - const uint16_t color = 0b0001111100000000); - -/** - * @brief Draw a point on RGB888 image. - * - * @param image pointer of input image - * @param image_height height of input image - * @param image_width width of input image - * @param x point x - * @param y point y - * @param size size of point - * @param color 0x 00| 00| 00| 00 - * reserved|channel 0|channel 1|channel 2 - */ -void draw_point(uint8_t *image, - const uint32_t image_height, - const uint32_t image_width, - const uint32_t x, - const uint32_t y, - const uint32_t size, - const uint32_t color = 0x00FF0000); - -/** - * @brief Draw a point on RGB565 image. - * - * @param image pointer of input image - * @param image_height height of input image - * @param image_width width of input image - * @param x point x - * @param y point y - * @param size size of point - * @param color 0b 000| 00000| 00000| 000 - * channel 1[2:0]|channel 0|channel 2|channel 1[5:3] - */ -void draw_point(uint16_t *image, - const uint32_t image_height, - const uint32_t image_width, - const uint32_t x, - const uint32_t y, - const uint32_t size, - uint16_t color = 0b0001111100000000); - -/** - * @brief Draw a hollow rectangle on RGB888 image. - * - * @param image pointer of input image - * @param image_height height of input image - * @param image_width width of input image - * @param x1 left up corner x - * @param y1 left up corner y - * @param x2 right bottom corner x - * @param y2 right bottom corner y - * @param color 0x 00| 00| 00| 00 - * reserved|channel 0|channel 1|channel 2 - */ -void draw_hollow_rectangle(uint8_t *image, - const uint32_t image_height, - const uint32_t image_width, - uint32_t x1, - uint32_t y1, - uint32_t x2, - uint32_t y2, - uint32_t color = 0x00FF0000); - -/** - * @brief Draw a hollow rectangle on RGB565 image. - * - * @param image pointer of input image - * @param image_height height of input image - * @param image_width width of input image - * @param x1 left up corner x - * @param y1 left up corner y - * @param x2 right bottom corner x - * @param y2 right bottom corner y - * @param color 0b 000| 00000| 00000| 000 - * channel 1[2:0]|channel 0|channel 2|channel 1[5:3] - */ -void draw_hollow_rectangle(uint16_t *image, - const uint32_t image_height, - const uint32_t image_width, - uint32_t x1, - uint32_t y1, - uint32_t x2, - uint32_t y2, - const uint16_t color = 0b0001111100000000); - -/** - * @brief Detect target moving by activated detection point number. Each cross in the figure below is a detection point. - * Once abs(frame_1_detection_point[i] - frame_2_detection_point[i]) > threshold, this detection point is activated. - * This function will return the number of activated detection point. - * - * __stride__________________________ - * | | | | | - * stride | | | | | - * | | | | | - * |________|________|________| | - * | | | | | - * | | | | | - * | | | | | - * |________|________|________| height - * | | | | | - * | | | | | - * | | | | | - * |________|________|________| | - * | | | | | - * | | | | | - * | | | | | - * |________|________|________|___|___ - * | | - * |__________width___________| - * | | - * - * Time consumption: - * Frame shape = (240, 240) - * Both frame are in PSRAM - * On ESP32-S3 with CPU 240MHz, QSPI 80MHz - * - * stride latency - * 1 28316us - * 2 8770us - * 4 3622us - * 8 1990us - * 16 880us - * 32 260us - * - * - * In a application, outside this function, threshold of activated detection point number is needed. - * Once activated detection point number > number_threshold, this two frame are judged target moved. - * How to determine the number_threshold? - * Let's assume that the minimize shape of target is (target_min_height, target_max_width). - * Then, the number_threshold = [target_min_height / stride] * [target_max_width / stride] * ratio, - * where ratio is in (0, 1), the smaller the ratio is, the more sensitive the detector is, the more false detected. - * - * - * @param f1 one frame in RGB565 - * @param f2 another frame in RGB565 - * @param height height of frame - * @param width width of frame - * @param stride stride of detection point, the smaller the stride is, the more reliable the detector is. - * @param threshold activation threshold of each detection point - * @return activated detection point number - */ -uint32_t get_moving_point_number(uint16_t *f1, - uint16_t *f2, - const uint32_t height, - const uint32_t width, - const uint32_t stride, - const uint32_t threshold = 5); - -/** - * @brief Detect target moving by activated detection point number. Each cross in the figure below is a detection point. - * Once abs(frame_1_detection_point[i] - frame_2_detection_point[i]) > threshold, this detection point is activated. - * This function will return the number of activated detection point. - * - * __stride__________________________ - * | | | | | - * stride | | | | | - * | | | | | - * |________|________|________| | - * | | | | | - * | | | | | - * | | | | | - * |________|________|________| height - * | | | | | - * | | | | | - * | | | | | - * |________|________|________| | - * | | | | | - * | | | | | - * | | | | | - * |________|________|________|___|___ - * | | - * |__________width___________| - * | | - * - * - * In a application, outside this function, threshold of activated detection point number is needed. - * Once activated detection point number > number_threshold, this two frame are judged target moved. - * How to determine the number_threshold? - * Let's assume that the minimize shape of target is (target_min_height, target_max_width). - * Then, the number_threshold = [target_min_height / stride] * [target_max_width / stride] * ratio, - * where ratio is in (0, 1), the smaller the ratio is, the more sensitive the detector is, the more false detected. - * - * - * @param f1 one frame in RGB888 - * @param f2 another frame in RGB888 - * @param height height of frame - * @param width width of frame - * @param stride stride of detection point, the smaller the stride is, the more reliable the detector is. - * @param threshold activation threshold of each detection point - * @return activated detection point number - */ -uint32_t get_moving_point_number(uint8_t *f1, - uint8_t *f2, - const uint32_t height, - const uint32_t width, - const uint32_t stride, - const uint32_t threshold = 5); - -/** - * @brief Apply an affine transformation to an image. - * - * @tparam T - * @param input the input image. - * @param output the output image. - * @param M_inv the inverse transformation matrix. - */ -template -void warp_affine(uint8_t *input, - const std::vector &input_shape, - T *output, - const std::vector &output_shape, - dl::math::Matrix *M_inv, - bool byte_swap = false); - -/** - * @brief Apply an affine transformation to an image. - * - * @tparam T - * @param input the pointer of the input image. - * @param shape the shape of the input image. - * @param output the output image. - * @param M_inv the inverse transformation matrix. - */ -template -void warp_affine(uint16_t *input, - const std::vector &input_shape, - T *output, - const std::vector &output_shape, - dl::math::Matrix *M_inv, - bool byte_swap = false); - -/** - * @brief Get the otsu thresh object. - * - * @param image the gray image. - * @return uint8_t the otsu thresh. - */ -uint8_t get_otsu_thresh(Tensor &image); - -/** - * @brief Convert RGB image to gray image - * - * @param image input image - * @param bgr true: the image is in BGR format - * false: the image is in RGB format - * @return Tensor* output image in gray format - */ -Tensor *rgb2gray(Tensor &image, bool bgr = false); - -/** - * @brief Convert RGB image to LAB image - * - * @param image input image - * @param bgr true: the image is in BGR format - * false: the image is in RGB format - * @param fast true: use the fast alogrithm, but the accuracy will be reduced - * false: do not use the fast alogrithm - * @return Tensor* output image in LAB foramt - */ -Tensor *rgb2lab(Tensor &image, bool bgr = false, bool fast = true); - -/** - * @brief Convert RGB image to HSV image - * - * @param image input image - * @param bgr true: the image is in BGR format - * false: the image is in RGB format - * @param fast true: use the fast alogrithm, but the accuracy will be reduced - * false: do not use the fast alogrithm - * @return Tensor* output image in HSV format - */ -Tensor *rgb2hsv(Tensor &image, bool bgr = false, bool fast = true); - -/** - * @brief resize an image to the target shape. - * - * @param image the input image Tensor - * @param target_shape the target shape of the resized image. - * @param resize_type one of IMAGE_RESIZE_BILINEAR or IMAGE_RESIZE_MEAN or IMAGE_RESIZE_NEAREST - * @return Tensor* the pointer of the resized image Tensor - */ -Tensor *resize_image(Tensor &image, std::vector target_shape, resize_type_t resize_type); - -/** - * @brief resize an image to the target shape. - * - * @param image the input image Tensor - * @param resized_image the resized image Tensor - * @param resize_type one of IMAGE_RESIZE_BILINEAR or IMAGE_RESIZE_MEAN or IMAGE_RESIZE_NEAREST - */ -void resize_image(Tensor &image, Tensor &resized_image, resize_type_t resize_type); - -/** - * @brief resize an image to the target shape with nearest method. - * - * @tparam T - * @param image the pointer of the input image - * @param input_shape the input shape of the image - * @param target_shape the target shape of the resized image - * @return T* the pointer of the resized image - */ -template -T *resize_image_nearest(T *image, std::vector input_shape, std::vector target_shape); - -/** - * @brief resize an image to the target shape with nearest method. - * - * @tparam T - * @param image the pointer of the input image - * @param input_shape the input shape of the image - * @param resized_image the pointer of the resized image - * @param target_shape the target shape of the resized image - */ -template -void resize_image_nearest(T *image, std::vector input_shape, T *resized_image, std::vector target_shape); - -} // namespace image -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/image/dl_image_preprocessor.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/image/dl_image_preprocessor.cpp deleted file mode 100644 index a0b9dae5..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/image/dl_image_preprocessor.cpp +++ /dev/null @@ -1,228 +0,0 @@ -#include "dl_image_preprocessor.hpp" - -#define ALIGN_UP(num, align) (((num) + ((align) - 1)) & ~((align) - 1)) - -namespace dl { -namespace image { -template -ImagePreprocessor::ImagePreprocessor(TensorBase *model_input, - const std::vector &mean, - const std::vector &std, - bool rgb_swap, - bool byte_swap, - bool use_ppa) : - model_input(model_input), mean(mean), std(std), rgb_swap(rgb_swap), byte_swap(byte_swap), use_ppa(use_ppa) - -{ - this->norm_lut = (feature_t *)tool::malloc_aligned( - mean.size() * 256, sizeof(feature_t), 16, MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM); - this->create_norm_lut(); -#if CONFIG_IDF_TARGET_ESP32P4 - if (this->use_ppa) { - memset(&this->ppa_client_config, 0, sizeof(ppa_client_config_t)); - this->ppa_client_config.oper_type = PPA_OPERATION_SRM; - ESP_ERROR_CHECK(ppa_register_client(&this->ppa_client_config, &this->ppa_client_srm_handle)); - memset(&this->srm_oper_config, 0, sizeof(ppa_srm_oper_config_t)); - size_t cache_line_size; - ESP_ERROR_CHECK(esp_cache_get_alignment(MALLOC_CAP_SPIRAM | MALLOC_CAP_DMA, &cache_line_size)); - this->ppa_buffer_size = - ALIGN_UP(model_input->shape[1] * model_input->shape[2] * model_input->shape[3], cache_line_size); - this->ppa_buffer = tool::calloc_aligned( - this->ppa_buffer_size, sizeof(uint8_t), cache_line_size, MALLOC_CAP_SPIRAM | MALLOC_CAP_DMA); - } -#endif -} - -template -ImagePreprocessor::~ImagePreprocessor() -{ - if (this->norm_lut) { - heap_caps_free(this->norm_lut); - this->norm_lut = nullptr; - } -#if CONFIG_IDF_TARGET_ESP32P4 - if (this->use_ppa) { - if (this->ppa_buffer) { - heap_caps_free(this->ppa_buffer); - this->ppa_buffer = nullptr; - } - if (this->ppa_client_srm_handle) { - ESP_ERROR_CHECK(ppa_unregister_client(this->ppa_client_srm_handle)); - this->ppa_client_srm_handle = nullptr; - } - } -#endif -} - -template -void ImagePreprocessor::create_norm_lut() -{ - // TODO support s3 round - for (int i = 0; i < this->mean.size(); i++) { - if (std::is_same::value) { - for (int j = 0; j < 256; j++) { - this->norm_lut[i * 256 + j] = (feature_t)DL_CLIP( - tool::round(((float)j - this->mean[i]) / this->std[i] / DL_SCALE(this->model_input->exponent)), - -128, - 127); - } - } else { - for (int j = 0; j < 65536; j++) { - this->norm_lut[i * 65536 + j] = (feature_t)DL_CLIP( - tool::round(((float)j - this->mean[i]) / this->std[i] / DL_SCALE(this->model_input->exponent)), - -32768, - 32767); - } - } - } -} - -template -template -void ImagePreprocessor::preprocess(T *input_element, - const std::vector &input_shape, - const std::vector &crop_area) -{ - dl::tool::Latency latency; - // step1. crop & resize - if (!crop_area.empty() || input_shape[0] != this->model_input->shape[1] || - input_shape[1] != this->model_input->shape[2]) { - latency.start(); - if (!crop_area.empty()) { - assert(crop_area.size() == 4); - input_area_x_start = crop_area[0]; - input_area_y_start = crop_area[1]; - input_area_x_end = crop_area[2]; - input_area_y_end = crop_area[3]; - } else { - input_area_x_start = 0; - input_area_y_start = 0; - input_area_x_end = input_shape[1]; - input_area_y_end = input_shape[0]; - } - this->resize_scale_y = (float)this->model_input->shape[1] / (input_area_y_end - input_area_y_start); - this->resize_scale_x = (float)this->model_input->shape[2] / ((input_area_x_end - input_area_x_start)); - -#if CONFIG_IDF_TARGET_ESP32P4 - // hardware resize with ppa - // only esp32p4 has ppa, - // ppa use 8 bit to store int part of scale, 4 bit to store frac part of scale. - bool ppa_available = false; - if (this->use_ppa && this->resize_scale_y < 256 && this->resize_scale_x < 256 && - this->resize_scale_y >= 0.0625 && this->resize_scale_x >= 0.0625) { - int resize_scale_y_int = floor(this->resize_scale_y); - int resize_scale_x_int = floor(this->resize_scale_x); - float resize_scale_y_frac = this->resize_scale_y - resize_scale_y_int; - float resize_scale_x_frac = this->resize_scale_x - resize_scale_x_int; - resize_scale_y_frac = floor(resize_scale_y_frac / 0.0625) * 0.0625; - resize_scale_x_frac = floor(resize_scale_x_frac / 0.0625) * 0.0625; - float new_resize_scale_y = resize_scale_y_int + resize_scale_y_frac; - float new_resize_scale_x = resize_scale_x_int + resize_scale_x_frac; - float error_percentage_y = - (this->model_input->shape[1] - new_resize_scale_y * (input_area_y_end - input_area_y_start)) / - this->model_input->shape[0]; - float error_percentage_x = - (this->model_input->shape[2] - new_resize_scale_x * (input_area_x_end - input_area_x_start)) / - this->model_input->shape[1]; - - if (error_percentage_x < 0.3 && error_percentage_y < 0.3) { - ppa_available = true; - this->resize_scale_y = new_resize_scale_y; - this->resize_scale_x = new_resize_scale_x; - } - } - - if (ppa_available) { - srm_oper_config.in.buffer = (const void *)input_element; - srm_oper_config.in.pic_h = input_shape[0]; - srm_oper_config.in.pic_w = input_shape[1]; - srm_oper_config.in.block_h = input_area_y_end - input_area_y_start; - srm_oper_config.in.block_w = input_area_x_end - input_area_x_start; - srm_oper_config.in.block_offset_y = input_area_y_start; - srm_oper_config.in.block_offset_x = input_area_x_start; - if (std::is_same::value) - srm_oper_config.in.srm_cm = PPA_SRM_COLOR_MODE_RGB888; - else - srm_oper_config.in.srm_cm = PPA_SRM_COLOR_MODE_RGB565; - - srm_oper_config.out.buffer = this->ppa_buffer; - srm_oper_config.out.buffer_size = this->ppa_buffer_size; - srm_oper_config.out.pic_h = model_input->shape[1]; - srm_oper_config.out.pic_w = model_input->shape[2]; - srm_oper_config.out.block_offset_x = 0; - srm_oper_config.out.block_offset_y = 0; - srm_oper_config.out.srm_cm = PPA_SRM_COLOR_MODE_RGB888; - - srm_oper_config.rotation_angle = PPA_SRM_ROTATION_ANGLE_0; - srm_oper_config.scale_x = this->resize_scale_x; - srm_oper_config.scale_y = this->resize_scale_y; - srm_oper_config.mirror_x = false; - srm_oper_config.mirror_y = false; - srm_oper_config.rgb_swap = this->rgb_swap; - - srm_oper_config.mode = PPA_TRANS_MODE_BLOCKING; - ESP_ERROR_CHECK(ppa_do_scale_rotate_mirror(this->ppa_client_srm_handle, &srm_oper_config)); - tool::copy_memory(this->model_input->data, this->ppa_buffer, this->model_input->get_bytes()); - } else -#endif - { - // software resize - crop_and_resize((uint8_t *)this->model_input->get_element_ptr(), - this->model_input->shape[2], - this->model_input->shape[3], - 0, - this->model_input->shape[1], - 0, - this->model_input->shape[2], - input_element, - input_shape[0], - input_shape[1], - input_shape[2], - input_area_y_start, - input_area_y_end, - input_area_x_start, - input_area_x_end, - IMAGE_RESIZE_NEAREST, - this->rgb_swap); - } - latency.end(); - latency.print("image_preprocess", "resize"); - } else { - latency.start(); - this->resize_scale_y = 1; - this->resize_scale_x = 1; - if (this->model_input->get_element_ptr() != input_element) - tool::copy_memory(this->model_input->get_element_ptr(), input_element, this->model_input->get_bytes()); - latency.end(); - latency.print("image_preprocess", "copy"); - } - - // step2. normalize quantize - // TODO add int16_t support - uint8_t *input = (uint8_t *)this->model_input->get_element_ptr(); - feature_t *norm_quant_input = (feature_t *)this->model_input->get_element_ptr(); - latency.start(); - for (int i = 0; i < this->model_input->get_size(); i++) { - norm_quant_input[i] = this->norm_lut[i % 3 * 256 + input[i]]; - } - latency.end(); - latency.print("image_preprocess", "normalize"); -} - -template void ImagePreprocessor::preprocess(uint8_t *input_element, - const std::vector &input_shape, - const std::vector &crop_area); -template void ImagePreprocessor::preprocess(uint16_t *input_element, - const std::vector &input_shape, - const std::vector &crop_area); -template void ImagePreprocessor::preprocess(uint8_t *input_element, - const std::vector &input_shape, - const std::vector &crop_area); -template void ImagePreprocessor::preprocess(uint16_t *input_element, - const std::vector &input_shape, - const std::vector &crop_area); - -template class ImagePreprocessor; -template class ImagePreprocessor; -} // namespace image -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/image/dl_image_preprocessor.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/image/dl_image_preprocessor.hpp deleted file mode 100644 index 0e8236a3..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/image/dl_image_preprocessor.hpp +++ /dev/null @@ -1,72 +0,0 @@ -#pragma once - -#include "cmath" -#include "dl_image.hpp" -#include "stdint.h" -// #include "dl_detect_define.hpp" -#include "esp_cache.h" -#include "driver/ppa.h" -#include "esp_private/esp_cache_private.h" - -namespace dl { -namespace image { -/** - * @brief rgb565->rgb888, crop, resize, normalize, quantize - * NOTE: input should be (h, w, 3) or (h, w, 1) with value range in [0, 255]. - * - * @tparam feature_t supports int16_t and int8_t, - * - int16_t: stands for operation in int16_t quantize - * - int8_t: stands for operation in int8_t quantize - */ -template -class ImagePreprocessor { -public: - TensorBase *model_input; - -private: - const std::vector mean; - const std::vector std; - bool rgb_swap; - bool byte_swap; - bool use_ppa; - feature_t *norm_lut; - int input_area_x_start; - int input_area_y_start; - int input_area_x_end; - int input_area_y_end; - float resize_scale_x; - float resize_scale_y; -#if CONFIG_IDF_TARGET_ESP32P4 - ppa_client_handle_t ppa_client_srm_handle; - ppa_client_config_t ppa_client_config; - ppa_srm_oper_config_t srm_oper_config; - size_t ppa_buffer_size; - void *ppa_buffer; -#endif - void create_norm_lut(); - -public: - ImagePreprocessor(TensorBase *model_input, - const std::vector &mean, - const std::vector &std, - bool byte_rgb = false, -#if CONFIG_IDF_TARGET_ESP32S3 - bool byte_swap = true, -#else - bool byte_swap = false, -#endif - bool use_ppa = false); - - ~ImagePreprocessor(); - - float get_resize_scale_x() { return this->resize_scale_x; }; - float get_resize_scale_y() { return this->resize_scale_y; }; - float get_top_left_x() { return this->input_area_x_start; }; - float get_top_left_y() { return this->input_area_y_start; }; - - template - void preprocess(T *input_element, const std::vector &input_shape, const std::vector &crop_area = {}); -}; - -} // namespace image -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_database_base.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_database_base.cpp deleted file mode 100644 index 05e0d11b..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_database_base.cpp +++ /dev/null @@ -1,294 +0,0 @@ -#include "dl_recognition_database_base.hpp" - -static const char *TAG = "dl::recognition::DataBase"; - -namespace dl { -namespace recognition { -void DataBase::init(int feat_len) -{ - ESP_ERROR_CHECK(this->mount()); - FILE *f = fopen(this->db_path, "rb"); - if (f == NULL) { - this->create_empty_database_in_storage(feat_len); - } else { - this->load_database_from_storage(feat_len); - } -} - -void DataBase::deinit() -{ - this->clear_all_feats_in_memory(); - ESP_ERROR_CHECK(this->unmount()); -} - -esp_err_t DataBase::create_empty_database_in_storage(int feat_len) -{ - FILE *f = fopen(this->db_path, "wb"); - size_t size = 0; - if (f) { - this->meta.num_feats_total = 0; - this->meta.num_feats_valid = 0; - this->meta.feat_len = feat_len; - size = fwrite(&this->meta, sizeof(database_meta), 1, f); - if (size != 1) { - ESP_LOGE(TAG, "Failed to write db meta data."); - return ESP_FAIL; - } - fclose(f); - return ESP_OK; - } else { - ESP_LOGE(TAG, "Failed to open db."); - return ESP_FAIL; - } -} - -esp_err_t DataBase::clear_all_feats() -{ - if (remove(this->db_path) == -1) { - ESP_LOGE(TAG, "Failed to remove db."); - return ESP_FAIL; - } - ESP_RETURN_ON_ERROR( - this->create_empty_database_in_storage(this->meta.feat_len), TAG, "Failed to create empty db in storage."); - this->clear_all_feats_in_memory(); - return ESP_OK; -} - -void DataBase::clear_all_feats_in_memory() -{ - for (auto it = this->feats.begin(); it != this->feats.end(); it++) { - heap_caps_free(it->feat); - } - this->feats.clear(); - this->meta.num_feats_total = 0; - this->meta.num_feats_valid = 0; -} - -esp_err_t DataBase::load_database_from_storage(int feat_len) -{ - this->clear_all_feats_in_memory(); - FILE *f = fopen(this->db_path, "rb"); - size_t size = 0; - if (f) { - size = fread(&this->meta, sizeof(database_meta), 1, f); - if (size != 1) { - ESP_LOGE(TAG, "Failed to read database meta."); - fclose(f); - return ESP_FAIL; - } - if (feat_len != this->meta.feat_len) { - ESP_LOGE(TAG, "Feature len in storage does not match feature len in db."); - fclose(f); - return ESP_FAIL; - } - uint16_t id; - for (int i = 0; i < this->meta.num_feats_total; i++) { - size = fread(&id, sizeof(uint16_t), 1, f); - if (size != 1) { - ESP_LOGE(TAG, "Failed to read feature id."); - fclose(f); - return ESP_FAIL; - } - if (id == 0) { - if (fseek(f, sizeof(float) * this->meta.feat_len, SEEK_CUR) == -1) { - ESP_LOGE(TAG, "Failed to seek db file."); - fclose(f); - return ESP_FAIL; - } - continue; - } - float *feat = (float *)heap_caps_malloc(sizeof(float) * this->meta.feat_len, MALLOC_CAP_SPIRAM); - size = fread(feat, sizeof(float), this->meta.feat_len, f); - if (size != this->meta.feat_len) { - ESP_LOGE(TAG, "Failed to read feature data."); - fclose(f); - return ESP_FAIL; - } - this->feats.emplace_back(id, feat); - } - if (this->feats.size() != this->meta.num_feats_valid) { - ESP_LOGE(TAG, "Incorrect valid feature num."); - fclose(f); - return ESP_FAIL; - } - fclose(f); - } else { - ESP_LOGE(TAG, "Failed to open db."); - return ESP_FAIL; - } - return ESP_OK; -} - -esp_err_t DataBase::enroll_feat(TensorBase *feat) -{ - ESP_RETURN_ON_ERROR(this->check_enough_free_space(), TAG, "No more space in storage."); - if (feat->dtype != DATA_TYPE_FLOAT) { - ESP_LOGE(TAG, "Only support float feature."); - return ESP_FAIL; - } - if (feat->size != this->meta.feat_len) { - ESP_LOGE(TAG, "Feature len to enroll does not match feature len in db."); - return ESP_FAIL; - } - float *feat_copy = (float *)heap_caps_malloc(sizeof(float) * this->meta.feat_len, MALLOC_CAP_SPIRAM); - memcpy(feat_copy, feat->data, feat->get_bytes()); - - this->feats.emplace_back(this->meta.num_feats_total + 1, feat_copy); - this->meta.num_feats_total++; - this->meta.num_feats_valid++; - - size_t size = 0; - FILE *f = fopen(this->db_path, "rb+"); - if (f) { - size = fwrite(&this->meta, sizeof(database_meta), 1, f); - if (size != 1) { - ESP_LOGE(TAG, "Failed to write database meta."); - fclose(f); - return ESP_FAIL; - } - if (fseek(f, 0, SEEK_END) == 0) { - size = fwrite(&this->feats.back().id, sizeof(uint16_t), 1, f); - if (size != 1) { - ESP_LOGE(TAG, "Failed to write feature id."); - fclose(f); - return ESP_FAIL; - } - size = fwrite(this->feats.back().feat, sizeof(float), this->meta.feat_len, f); - if (size != this->meta.feat_len) { - ESP_LOGE(TAG, "Failed to write feature."); - fclose(f); - return ESP_FAIL; - } - } else { - ESP_LOGE(TAG, "Failed to seek db file."); - fclose(f); - return ESP_FAIL; - } - } else { - ESP_LOGE(TAG, "Failed to open db."); - fclose(f); - return ESP_FAIL; - } - fclose(f); - return ESP_OK; -} - -esp_err_t DataBase::delete_feat(uint16_t id) -{ - bool invalid_id = true; - for (auto it = this->feats.begin(); it != this->feats.end(); it++) { - if (it->id != id) { - continue; - } else { - heap_caps_free(it->feat); - it = this->feats.erase(it); - this->meta.num_feats_valid--; - invalid_id = false; - break; - } - } - if (invalid_id) { - ESP_LOGW(TAG, "Invalid id to delete."); - return ESP_FAIL; - } - size_t size = 0; - FILE *f = fopen(this->db_path, "rb+"); - if (f) { - long int offset = sizeof(database_meta) + (sizeof(uint16_t) + sizeof(float) * this->meta.feat_len) * (id - 1); - uint16_t id = 0; - if (fseek(f, offset, SEEK_SET) == 0) { - size = fwrite(&id, sizeof(uint16_t), 1, f); - if (size != 1) { - ESP_LOGE(TAG, "Failed to write feature id."); - fclose(f); - return ESP_FAIL; - } - } else { - ESP_LOGE(TAG, "Failed to seek db file."); - fclose(f); - return ESP_FAIL; - } - - offset = sizeof(uint16_t); - if (fseek(f, offset, SEEK_SET) == 0) { - size = fwrite(&this->meta.num_feats_valid, sizeof(uint16_t), 1, f); - if (size != 1) { - ESP_LOGE(TAG, "Failed to write valid feature num."); - fclose(f); - return ESP_FAIL; - } - } else { - ESP_LOGE(TAG, "Failed to seek db file."); - fclose(f); - return ESP_FAIL; - } - } else { - ESP_LOGE(TAG, "Failed to open db."); - fclose(f); - return ESP_FAIL; - } - fclose(f); - return ESP_OK; -} - -esp_err_t DataBase::delete_last_feat() -{ - if (!this->feats.empty()) { - uint16_t id = this->feats.back().id; - return this->delete_feat(id); - } else { - ESP_LOGW(TAG, "Empty db, nothing to delete"); - return ESP_FAIL; - } -} - -float DataBase::cal_similarity(float *feat1, float *feat2) -{ - float sum = 0; - for (int i = 0; i < this->meta.feat_len; i++) { - sum += feat1[i] * feat2[i]; - } - return sum; -} - -std::list DataBase::query_feat(TensorBase *feat, float thr, int top_k) -{ - std::list res; - if (top_k < 1) { - ESP_LOGW(TAG, "Top_k should be greater than 0."); - return res; - } - float sim; - for (auto it = this->feats.begin(); it != this->feats.end(); it++) { - sim = this->cal_similarity(it->feat, (float *)feat->data); - if (sim <= thr) { - continue; - } - query_info q = {it->id, sim}; - res.insert(std::upper_bound(res.begin(), res.end(), q, greater_query_info), q); - if (res.size() > top_k) - res.pop_back(); - } - return res; -} - -void DataBase::print() -{ - printf("\n"); - printf("[db meta]\nnum_feats_total: %d, num_feats_valid: %d, feat_len: %d\n", - this->meta.num_feats_total, - this->meta.num_feats_valid, - this->meta.feat_len); - printf("[feats]\n"); - for (auto it : this->feats) { - printf("id: %d feat: ", it.id); - for (int i = 0; i < this->meta.feat_len; i++) { - printf("%f, ", it.feat[i]); - } - printf("\n"); - } - printf("\n"); -} - -} // namespace recognition -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_database_base.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_database_base.hpp deleted file mode 100644 index 8256bac7..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_database_base.hpp +++ /dev/null @@ -1,41 +0,0 @@ -#pragma once -#include "dl_recognition_define.hpp" -#include "dl_tensor_base.hpp" -#include "esp_check.h" -#include "esp_system.h" -#include -#include - -namespace dl { -namespace recognition { -class DataBase { -public: - DataBase(const char *name) : name(name) {} - virtual ~DataBase() {} - esp_err_t clear_all_feats(); - esp_err_t enroll_feat(TensorBase *feat); - esp_err_t delete_feat(uint16_t id); - esp_err_t delete_last_feat(); - std::list query_feat(TensorBase *feat, float thr, int top_k); - void print(); - -protected: - char db_path[50]; - const char *name; - database_meta meta; - void init(int feat_len); - void deinit(); - -private: - std::list feats; - - esp_err_t create_empty_database_in_storage(int feat_len); - esp_err_t load_database_from_storage(int feat_len); - void clear_all_feats_in_memory(); - virtual esp_err_t mount() = 0; - virtual esp_err_t unmount() = 0; - virtual esp_err_t check_enough_free_space() = 0; - float cal_similarity(float *feat1, float *feat2); -}; -} // namespace recognition -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_define.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_define.hpp deleted file mode 100644 index 64edcaa1..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_define.hpp +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once -#include - -namespace dl { -namespace recognition { -typedef enum { DB_FATFS_FLASH, DB_FATFS_SDCARD, DB_SPIFFS, DB_MAX = DB_SPIFFS } db_type_t; -typedef struct { - uint16_t num_feats_total; - uint16_t num_feats_valid; - uint16_t feat_len; -} database_meta; - -typedef struct { - uint16_t id; - float *feat; -} database_feat; - -typedef struct { - uint16_t id; - float similarity; -} query_info; - -inline bool greater_query_info(const query_info &a, const query_info &b) -{ - return a.similarity > b.similarity; -} -} // namespace recognition -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_face_image_preprocessor.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_face_image_preprocessor.cpp deleted file mode 100644 index 6345ba42..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_face_image_preprocessor.cpp +++ /dev/null @@ -1,64 +0,0 @@ -#include "dl_recognition_face_image_preprocessor.hpp" - -namespace dl { -namespace recognition { -template -std::vector FaceImagePreprocessor::std_ldks_112 = { - 38.2946, 51.6963, 41.5493, 92.3655, 56.0252, 71.7366, 73.5318, 51.5014, 70.7299, 92.2041}; - -template -template -void FaceImagePreprocessor::preprocess(T *input_element, - const std::vector &input_shape, - const std::vector &landmarks) -{ - assert(landmarks.size() == 10); - // align face - float h_scale = (float)this->image_preprocessor->model_input->shape[1] / 112.0; - float w_scale = (float)this->image_preprocessor->model_input->shape[2] / 112.0; - dl::math::Matrix source_coord(5, 2); - dl::math::Matrix dest_coord(5, 2); - dest_coord.set_value(landmarks); - for (int i = 0; i < source_coord.h; i++) { - source_coord.array[i][0] = w_scale * std_ldks_112[2 * i]; - source_coord.array[i][1] = h_scale * std_ldks_112[2 * i + 1]; - } - dl::math::Matrix M_inv = dl::math::get_similarity_transform(source_coord, dest_coord); - std::vector model_input_shape = {this->image_preprocessor->model_input->shape[1], - this->image_preprocessor->model_input->shape[2], - this->image_preprocessor->model_input->shape[3]}; - if (std::is_same::value) - dl::image::warp_affine(input_element, - input_shape, - (uint8_t *)this->image_preprocessor->model_input->data, - model_input_shape, - &M_inv, - this->byte_swap); - else - dl::image::warp_affine(input_element, - input_shape, - (int16_t *)this->image_preprocessor->model_input->data, - model_input_shape, - &M_inv, - this->byte_swap); - // normalize & quantize - this->image_preprocessor->preprocess((uint8_t *)this->image_preprocessor->model_input->data, model_input_shape); -} - -template void FaceImagePreprocessor::preprocess(uint8_t *input_element, - const std::vector &input_shape, - const std::vector &landmarks); -template void FaceImagePreprocessor::preprocess(uint16_t *input_element, - const std::vector &input_shape, - const std::vector &landmarks); -template void FaceImagePreprocessor::preprocess(uint8_t *input_element, - const std::vector &input_shape, - const std::vector &landmarks); -template void FaceImagePreprocessor::preprocess(uint16_t *input_element, - const std::vector &input_shape, - const std::vector &landmarks); - -template class FaceImagePreprocessor; -template class FaceImagePreprocessor; -} // namespace recognition -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_face_image_preprocessor.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_face_image_preprocessor.hpp deleted file mode 100644 index f79b152b..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_face_image_preprocessor.hpp +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once -#include "dl_image_preprocessor.hpp" -#include "dl_math_matrix.hpp" - -namespace dl { -namespace recognition { -template -class FaceImagePreprocessor { -public: - FaceImagePreprocessor(TensorBase *model_input, - const std::vector &mean, - const std::vector &std, - bool byte_rgb = false, - bool byte_swap = false, - bool use_ppa = false) : - image_preprocessor( - new dl::image::ImagePreprocessor(model_input, mean, std, byte_rgb, byte_swap, use_ppa)), - byte_swap(byte_swap) {}; - - template - void preprocess(T *input_element, const std::vector &input_shape, const std::vector &landmarks); - -private: - static std::vector std_ldks_112; - dl::image::ImagePreprocessor *image_preprocessor; - bool byte_swap; -}; -} // namespace recognition -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_postprocessor.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_postprocessor.cpp deleted file mode 100644 index 6ed9c83e..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_postprocessor.cpp +++ /dev/null @@ -1,32 +0,0 @@ -#include "dl_recognition_postprocessor.hpp" - -namespace dl { -namespace recognition { - -template -TensorBase *RecognitionPostprocessor::postprocess(std::map &model_outputs_map) -{ - TensorBase *embedding = model_outputs_map.at("embedding"); - this->feat->assign(embedding); - this->l2_norm(); - return this->feat; -} - -template -void RecognitionPostprocessor::l2_norm() -{ - float norm = 0; - float *ptr = (float *)this->feat->get_element_ptr(); - for (int i = 0; i < this->feat->get_size(); i++) { - norm += (ptr[i] * ptr[i]); - } - norm = dl::math::sqrt_newton(norm); - for (int i = 0; i < this->feat->get_size(); i++) { - ptr[i] /= norm; - } -} - -template class RecognitionPostprocessor; -template class RecognitionPostprocessor; -} // namespace recognition -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_postprocessor.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_postprocessor.hpp deleted file mode 100644 index 97f82a3a..00000000 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/esp-dl/vision/recognition/dl_recognition_postprocessor.hpp +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once -#include "dl_math.hpp" -#include -#include - -namespace dl { -namespace recognition { -template -class RecognitionPostprocessor { -private: - TensorBase *feat; - void l2_norm(); - -public: - RecognitionPostprocessor(TensorBase *model_output) : - feat(new TensorBase(model_output->shape, nullptr, model_output->exponent, DATA_TYPE_FLOAT)) - { - } - TensorBase *postprocess(std::map &model_outputs_map); - ~RecognitionPostprocessor() - { - if (this->feat) { - delete this->feat; - this->feat = nullptr; - } - } -}; -} // namespace recognition -} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/CMakeLists.txt b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/CMakeLists.txt new file mode 100644 index 00000000..9cc51a88 --- /dev/null +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/CMakeLists.txt @@ -0,0 +1,30 @@ +set(src_dirs .) + +set(include_dirs .) + +set(requires esp-dl) + +set(HUMAN_FACE_DETECT_MODEL ${BUILD_DIR}/espdl_models/human_face_detect.espdl) + +set(embed_files ${HUMAN_FACE_DETECT_MODEL}) + +idf_component_register(SRC_DIRS ${src_dirs} INCLUDE_DIRS ${include_dirs} REQUIRES ${requires} EMBED_FILES ${embed_files}) + +set(MVMODEL_EXE ${COMPONENT_DIR}/pack_model.py) + +file(MAKE_DIRECTORY ${BUILD_DIR}/espdl_models) + +if(IDF_TARGET STREQUAL "esp32s3") + file(GLOB MODEL_FILE_PATH "${COMPONENT_DIR}/models/*esp32s3.espdl") +elseif (IDF_TARGET STREQUAL "esp32p4") + file(GLOB MODEL_FILE_PATH "${COMPONENT_DIR}/models/*esp32p4.espdl") +endif() + +message(${MODEL_FILE_PATH}) + +add_custom_command( + OUTPUT ${HUMAN_FACE_DETECT_MODEL} + COMMENT "Move and Pack models..." + COMMAND python ${MVMODEL_EXE} --model_path ${MODEL_FILE_PATH} --out_file ${HUMAN_FACE_DETECT_MODEL} + DEPENDS ${CONFIG_MODEL_FILE_PATH} + VERBATIM) diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/LICENSE b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/LICENSE new file mode 100644 index 00000000..dc2be013 --- /dev/null +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Espressif Systems (Shanghai) Co., Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/dl_detect_mnp01_postprocessor.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/dl_detect_mnp01_postprocessor.cpp new file mode 100644 index 00000000..f6911a86 --- /dev/null +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/dl_detect_mnp01_postprocessor.cpp @@ -0,0 +1,109 @@ +#include "dl_detect_mnp01_postprocessor.hpp" +#include "dl_math.hpp" +#include +#include + +namespace dl { +namespace detect { +template +void MNP01Postprocessor::parse_stage(TensorBase *score, + TensorBase *box, + TensorBase *landmark, + const int stage_index) +{ + std::vector> &anchor_shape = this->stages[stage_index].anchor_shape; + + int H = score->shape[1]; + int W = score->shape[2]; + int A = anchor_shape.size(); + int C = score->shape[3] / A; + feature_t *score_element = (feature_t *)score->get_element_ptr(); + feature_t *box_element = (feature_t *)box->get_element_ptr(); + feature_t *landmark_element = (feature_t *)landmark->get_element_ptr(); + + for (size_t y = 0; y < H; y++) // height + { + for (size_t x = 0; x < W; x++) // width + { + for (size_t a = 0; a < A; a++) // anchor number + { + // softmax + float scores[C]; + scores[0] = score_element[0] * DL_SCALE(score->exponent); + float max_score = scores[0]; + int max_score_c = 0; + for (int i = 1; i < C; i++) { + scores[i] = score_element[i] * DL_SCALE(score->exponent); + if (max_score < scores[i]) { + max_score_c = i; + max_score = scores[i]; + } + } + float sum = 0; + for (int i = 0; i < C; i++) { + sum += expf(scores[i] - max_score); + } + max_score = 1. / sum; + + if (max_score > score_threshold) { + int anchor_h = anchor_shape[a][0]; + int anchor_w = anchor_shape[a][1]; + result_t new_box = { + max_score_c, + max_score, + {(int)(anchor_w * box_element[0] * DL_SCALE(box->exponent) / this->resize_scale_x + + this->top_left_x), + (int)(anchor_h * box_element[1] * DL_SCALE(box->exponent) / this->resize_scale_y + + this->top_left_y), + (int)((anchor_w * box_element[2] * DL_SCALE(box->exponent) + anchor_w) / this->resize_scale_x + + this->top_left_x), + (int)((anchor_h * box_element[3] * DL_SCALE(box->exponent) + anchor_h) / this->resize_scale_y + + this->top_left_y)}, + { + (int)(anchor_w * landmark_element[0] * DL_SCALE(landmark->exponent) / this->resize_scale_x + + this->top_left_x), + (int)(anchor_h * landmark_element[1] * DL_SCALE(landmark->exponent) / this->resize_scale_y + + this->top_left_y), + (int)(anchor_w * landmark_element[2] * DL_SCALE(landmark->exponent) / this->resize_scale_x + + this->top_left_x), + (int)(anchor_h * landmark_element[3] * DL_SCALE(landmark->exponent) / this->resize_scale_y + + this->top_left_y), + (int)(anchor_w * landmark_element[4] * DL_SCALE(landmark->exponent) / this->resize_scale_x + + this->top_left_x), + (int)(anchor_h * landmark_element[5] * DL_SCALE(landmark->exponent) / this->resize_scale_y + + this->top_left_y), + (int)(anchor_w * landmark_element[6] * DL_SCALE(landmark->exponent) / this->resize_scale_x + + this->top_left_x), + (int)(anchor_h * landmark_element[7] * DL_SCALE(landmark->exponent) / this->resize_scale_y + + this->top_left_y), + (int)(anchor_w * landmark_element[8] * DL_SCALE(landmark->exponent) / this->resize_scale_x + + this->top_left_x), + (int)(anchor_h * landmark_element[9] * DL_SCALE(landmark->exponent) / this->resize_scale_y + + this->top_left_y), + }}; + + this->box_list.insert( + std::upper_bound(this->box_list.begin(), this->box_list.end(), new_box, compare_greater_box), + new_box); + } + score_element += C; + box_element += 4; + landmark_element += 10; + } + } + } +} + +template +void MNP01Postprocessor::postprocess() +{ + TensorBase *score = this->get_model_output("score"); + TensorBase *bbox = this->get_model_output("box"); + TensorBase *landmark = this->get_model_output("landmark"); + this->parse_stage(score, bbox, landmark, 0); +} + +template class MNP01Postprocessor; +template class MNP01Postprocessor; +} // namespace detect +} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/dl_detect_mnp01_postprocessor.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/dl_detect_mnp01_postprocessor.hpp new file mode 100644 index 00000000..9b222631 --- /dev/null +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/dl_detect_mnp01_postprocessor.hpp @@ -0,0 +1,16 @@ +#pragma once +#include "dl_detect_postprocessor.hpp" + +namespace dl { +namespace detect { +template +class MNP01Postprocessor : public AnchorBoxDetectPostprocessor { +private: + void parse_stage(TensorBase *score, TensorBase *box, TensorBase *landmark, const int stage_index); + +public: + void postprocess() override; + using AnchorBoxDetectPostprocessor::AnchorBoxDetectPostprocessor; +}; +} // namespace detect +} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/dl_detect_msr01_postprocessor.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/dl_detect_msr01_postprocessor.cpp new file mode 100644 index 00000000..c9575051 --- /dev/null +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/dl_detect_msr01_postprocessor.cpp @@ -0,0 +1,92 @@ +#include "dl_detect_msr01_postprocessor.hpp" +#include "dl_math.hpp" +#include +#include + +namespace dl { +namespace detect { +template +void MSR01Postprocessor::parse_stage(TensorBase *score, TensorBase *box, const int stage_index) +{ + int stride_y = this->stages[stage_index].stride_y; + int stride_x = this->stages[stage_index].stride_x; + + int offset_y = this->stages[stage_index].offset_y; + int offset_x = this->stages[stage_index].offset_x; + + std::vector> &anchor_shape = this->stages[stage_index].anchor_shape; + + int H = score->shape[1]; + int W = score->shape[2]; + int A = anchor_shape.size(); + int C = score->shape[3] / A; + feature_t *score_element = (feature_t *)score->get_element_ptr(); + feature_t score_threshold_quant; + if (std::is_same::value) { + score_threshold_quant = (feature_t)DL_CLIP( + tool::round(dl::math::inverse_sigmoid(this->score_threshold) / DL_SCALE(score->exponent)), -128, 127); + } else { + score_threshold_quant = (feature_t)DL_CLIP( + tool::round(dl::math::inverse_sigmoid(this->score_threshold) / DL_SCALE(score->exponent)), -32768, 32767); + } + + feature_t *box_element = (feature_t *)box->get_element_ptr(); + + for (size_t y = 0; y < H; y++) // height + { + for (size_t x = 0; x < W; x++) // width + { + for (size_t a = 0; a < A; a++) // anchor number + { + for (size_t c = 0; c < C; c++) // category number + { + if (*score_element > score_threshold_quant) { + int center_y = y * stride_y + offset_y; + int center_x = x * stride_x + offset_x; + int anchor_h = anchor_shape[a][0]; + int anchor_w = anchor_shape[a][1]; + result_t new_box = { + (int)c, + dl::math::sigmoid(*score_element * DL_SCALE(score->exponent)), + {(int)((center_x - (anchor_w >> 1) + anchor_w * box_element[0] * DL_SCALE(box->exponent)) / + this->resize_scale_x), + (int)((center_y - (anchor_h >> 1) + anchor_h * box_element[1] * DL_SCALE(box->exponent)) / + this->resize_scale_y), + (int)((center_x + anchor_w - (anchor_w >> 1) + + anchor_w * box_element[2] * DL_SCALE(box->exponent)) / + this->resize_scale_x), + (int)((center_y + anchor_h - (anchor_h >> 1) + + anchor_h * box_element[3] * DL_SCALE(box->exponent)) / + this->resize_scale_y)}, + {0}}; + + this->box_list.insert( + std::upper_bound( + this->box_list.begin(), this->box_list.end(), new_box, compare_greater_box), + new_box); + } + score_element++; + box_element += 4; + } + } + } + } +} + +template +void MSR01Postprocessor::postprocess() +{ + TensorBase *score0 = this->get_model_output("score0"); + TensorBase *bbox0 = this->get_model_output("box0"); + TensorBase *score1 = this->get_model_output("score1"); + TensorBase *bbox1 = this->get_model_output("box1"); + + this->parse_stage(score0, bbox0, 0); + this->parse_stage(score1, bbox1, 1); + this->nms(); +} + +template class MSR01Postprocessor; +template class MSR01Postprocessor; +} // namespace detect +} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/dl_detect_msr01_postprocessor.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/dl_detect_msr01_postprocessor.hpp new file mode 100644 index 00000000..7e4b0f18 --- /dev/null +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/dl_detect_msr01_postprocessor.hpp @@ -0,0 +1,16 @@ +#pragma once +#include "dl_detect_postprocessor.hpp" + +namespace dl { +namespace detect { +template +class MSR01Postprocessor : public AnchorBoxDetectPostprocessor { +private: + void parse_stage(TensorBase *score, TensorBase *box, const int stage_index); + +public: + void postprocess() override; + using AnchorBoxDetectPostprocessor::AnchorBoxDetectPostprocessor; +}; +} // namespace detect +} // namespace dl diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/human_face_detect.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/human_face_detect.cpp new file mode 100644 index 00000000..851bee61 --- /dev/null +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/human_face_detect.cpp @@ -0,0 +1,198 @@ +#include "human_face_detect.hpp" + +extern const uint8_t human_face_detect_espdl[] asm("_binary_human_face_detect_espdl_start"); + +HumanFaceDetect::HumanFaceDetect() +{ + this->stage1_model = (void *)new model_zoo::MSR01( + 0.5, + 0.5, + 10, + {{8, 8, 9, 9, {{16, 16}, {32, 32}}}, {16, 16, 9, 9, {{64, 64}, {128, 128}}}}, + {0, 0, 0}, + {1, 1, 1}); + this->stage2_model = + (void *)new model_zoo::MNP01(0.5, 0.5, 10, {{1, 1, 0, 0, {{48, 48}}}}, {0, 0, 0}, {1, 1, 1}); +} + +HumanFaceDetect::~HumanFaceDetect() +{ + if (this->stage1_model) { + delete (model_zoo::MSR01 *)this->stage1_model; + this->stage1_model = nullptr; + } + if (this->stage2_model) { + delete (model_zoo::MNP01 *)this->stage2_model; + this->stage2_model = nullptr; + } +} + +template +std::list &HumanFaceDetect::run(T *input_element, std::vector input_shape) +{ + std::list &candidates = + ((model_zoo::MSR01 *)this->stage1_model)->run(input_element, input_shape); + return ((model_zoo::MNP01 *)this->stage2_model)->run(input_element, input_shape, candidates); +} +template std::list &HumanFaceDetect::run(uint16_t *input_element, std::vector input_shape); +template std::list &HumanFaceDetect::run(uint8_t *input_element, std::vector input_shape); +namespace model_zoo { + +template +MSR01::MSR01(const float score_threshold, + const float nms_threshold, + const int top_k, + const std::vector &stages, + const std::vector &mean, + const std::vector &std) : + model(new dl::Model((const char *)human_face_detect_espdl, fbs::MODEL_LOCATION_IN_FLASH_RODATA, 1)), + postprocessor(new dl::detect::MSR01Postprocessor( + this->model->get_outputs(), score_threshold, nms_threshold, top_k, stages)) +{ + std::map model_inputs_map = this->model->get_inputs(); + assert(model_inputs_map.size() == 1); + dl::TensorBase *model_input = model_inputs_map.begin()->second; + this->image_preprocessor = new dl::image::ImagePreprocessor(model_input, mean, std); +} + +template +MSR01::~MSR01() +{ + if (this->model) { + delete this->model; + this->model = nullptr; + } + if (this->image_preprocessor) { + delete this->image_preprocessor; + this->image_preprocessor = nullptr; + } + if (this->postprocessor) { + delete this->postprocessor; + this->postprocessor = nullptr; + } +} + +template +template +std::list &MSR01::run(T *input_element, std::vector input_shape) +{ + dl::tool::Latency latency[3] = {dl::tool::Latency(), dl::tool::Latency(), dl::tool::Latency()}; + latency[0].start(); + this->image_preprocessor->preprocess(input_element, input_shape); + latency[0].end(); + + latency[1].start(); + this->model->run(); + latency[1].end(); + + latency[2].start(); + this->postprocessor->clear_result(); + this->postprocessor->set_resize_scale_x(this->image_preprocessor->get_resize_scale_x()); + this->postprocessor->set_resize_scale_y(this->image_preprocessor->get_resize_scale_y()); + this->postprocessor->postprocess(); + std::list &result = this->postprocessor->get_result(input_shape); + latency[2].end(); + + latency[0].print("detect", "preprocess"); + latency[1].print("detect", "forward"); + latency[2].print("detect", "postprocess"); + + return result; +} + +template std::list &MSR01::run(uint8_t *input_element, std::vector input_shape); +template std::list &MSR01::run(uint16_t *input_element, std::vector input_shape); +template std::list &MSR01::run(uint8_t *input_element, std::vector input_shape); +template std::list &MSR01::run(uint16_t *input_element, std::vector input_shape); + +template +MNP01::MNP01(const float score_threshold, + const float nms_threshold, + const int top_k, + const std::vector &stages, + const std::vector &mean, + const std::vector &std) : + model(new dl::Model((const char *)human_face_detect_espdl, fbs::MODEL_LOCATION_IN_FLASH_RODATA, 0)), + postprocessor(new dl::detect::MNP01Postprocessor( + this->model->get_outputs(), score_threshold, nms_threshold, top_k, stages)) +{ + std::map model_inputs_map = this->model->get_inputs(); + assert(model_inputs_map.size() == 1); + dl::TensorBase *model_input = model_inputs_map.begin()->second; + this->image_preprocessor = new dl::image::ImagePreprocessor(model_input, mean, std); +} + +template +MNP01::~MNP01() +{ + if (this->model) { + delete this->model; + this->model = nullptr; + } + if (this->image_preprocessor) { + delete this->image_preprocessor; + this->image_preprocessor = nullptr; + } + if (this->postprocessor) { + delete this->postprocessor; + this->postprocessor = nullptr; + } +}; + +template +template +std::list &MNP01::run(T *input_element, + std::vector input_shape, + std::list &candidates) +{ + dl::tool::Latency latency[3] = {dl::tool::Latency(10), dl::tool::Latency(10), dl::tool::Latency(10)}; + this->postprocessor->clear_result(); + for (auto &candidate : candidates) { + int center_x = (candidate.box[0] + candidate.box[2]) >> 1; + int center_y = (candidate.box[1] + candidate.box[3]) >> 1; + int side = DL_MAX(candidate.box[2] - candidate.box[0], candidate.box[3] - candidate.box[1]); + candidate.box[0] = center_x - (side >> 1); + candidate.box[1] = center_y - (side >> 1); + candidate.box[2] = candidate.box[0] + side; + candidate.box[3] = candidate.box[1] + side; + + latency[0].start(); + this->image_preprocessor->preprocess(input_element, input_shape, candidate.box); + latency[0].end(); + + latency[1].start(); + this->model->run(); + latency[1].end(); + + latency[2].start(); + this->postprocessor->set_resize_scale_x(this->image_preprocessor->get_resize_scale_x()); + this->postprocessor->set_resize_scale_y(this->image_preprocessor->get_resize_scale_y()); + this->postprocessor->set_top_left_x(this->image_preprocessor->get_top_left_x()); + this->postprocessor->set_top_left_y(this->image_preprocessor->get_top_left_y()); + this->postprocessor->postprocess(); + latency[2].end(); + } + this->postprocessor->nms(); + std::list &result = this->postprocessor->get_result(input_shape); + if (candidates.size() > 0) { + latency[0].print("detect", "preprocess"); + latency[1].print("detect", "forward"); + latency[2].print("detect", "postprocess"); + } + return result; +} + +template std::list &MNP01::run(uint8_t *input_element, + std::vector input_shape, + std::list &candidates); +template std::list &MNP01::run(uint16_t *input_element, + std::vector input_shape, + std::list &candidates); +template std::list &MNP01::run(uint8_t *input_element, + std::vector input_shape, + std::list &candidates); +template std::list &MNP01::run(uint16_t *input_element, + std::vector input_shape, + std::list &candidates); + +} // namespace model_zoo diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/human_face_detect.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/human_face_detect.hpp new file mode 100644 index 00000000..8218e793 --- /dev/null +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/human_face_detect.hpp @@ -0,0 +1,81 @@ +#pragma once + +#include "dl_detect_mnp01_postprocessor.hpp" +#include "dl_detect_msr01_postprocessor.hpp" +#include "dl_image_preprocessor.hpp" +#include "dl_model_base.hpp" + +class HumanFaceDetect { +private: + void *stage1_model; + void *stage2_model; + +public: + /** + * @brief Construct a new HumanFaceDetect object + */ + HumanFaceDetect(); + + /** + * @brief Destroy the HumanFaceDetect object + */ + ~HumanFaceDetect(); + + /** + * @brief Inference. + * + * @tparam T supports uint8_t and uint16_t + * - uint8_t: input image is RGB888 + * - uint16_t: input image is RGB565 + * @param input_element pointer of input image + * @param input_shape shape of input image + * @return detection result + */ + template + std::list &run(T *input_element, std::vector input_shape); +}; +namespace model_zoo { + +template +class MSR01 { +private: + dl::Model *model; + dl::image::ImagePreprocessor *image_preprocessor; + dl::detect::MSR01Postprocessor *postprocessor; + +public: + MSR01(const float score_threshold, + const float nms_threshold, + const int top_k, + const std::vector &stages, + const std::vector &mean, + const std::vector &std); + ~MSR01(); + + template + std::list &run(T *input_element, std::vector input_shape); +}; + +template +class MNP01 { +private: + dl::Model *model; + dl::image::ImagePreprocessor *image_preprocessor; + dl::detect::MNP01Postprocessor *postprocessor; + +public: + MNP01(const float score_threshold, + const float nms_threshold, + const int top_k, + const std::vector &stages, + const std::vector &mean, + const std::vector &std); + ~MNP01(); + + template + std::list &run(T *input_element, + std::vector input_shape, + std::list &candidates); +}; + +} // namespace model_zoo diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/idf_component.yml b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/idf_component.yml new file mode 100644 index 00000000..08c156e5 --- /dev/null +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/idf_component.yml @@ -0,0 +1,7 @@ +version: "0.1.0~1" +license: "MIT" +description: human face detect model. +url: https://github.com/espressif/esp-dl/tree/master/models/human_face_detect +dependencies: + espressif/esp-dl: + version: "^3.0.0-rc.1" \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/models/mnp01_esp32p4.espdl b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/models/mnp01_esp32p4.espdl new file mode 100644 index 00000000..f1e8f489 Binary files /dev/null and b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/models/mnp01_esp32p4.espdl differ diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/models/mnp01_esp32s3.espdl b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/models/mnp01_esp32s3.espdl new file mode 100644 index 00000000..e7eede40 Binary files /dev/null and b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/models/mnp01_esp32s3.espdl differ diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/models/msr01_esp32p4.espdl b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/models/msr01_esp32p4.espdl new file mode 100644 index 00000000..4190f44a Binary files /dev/null and b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/models/msr01_esp32p4.espdl differ diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/models/msr01_esp32s3.espdl b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/models/msr01_esp32s3.espdl new file mode 100644 index 00000000..243e2dbd Binary files /dev/null and b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/models/msr01_esp32s3.espdl differ diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/pack_model.py b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/pack_model.py new file mode 100644 index 00000000..8634390f --- /dev/null +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/human_face_detect/pack_model.py @@ -0,0 +1,136 @@ +import argparse +import shutil +import struct +from pathlib import Path + + +def struct_pack_string(string, max_len=None): + """ + pack string to binary data. + if max_len is None, max_len = len(string) + 1 + else len(string) < max_len, the left will be padded by struct.pack('x') + + string: input python string + max_len: output + """ + + if max_len == None: + max_len = len(string) + else: + assert len(string) <= max_len + + left_num = max_len - len(string) + out_bytes = None + for char in string: + if out_bytes == None: + out_bytes = struct.pack("b", ord(char)) + else: + out_bytes += struct.pack("b", ord(char)) + for i in range(left_num): + out_bytes += struct.pack("x") + return out_bytes + + +def read_data(filename): + """ + Read binary data, like index and mndata + """ + data = None + with open(filename, "rb") as f: + data = f.read() + return data + + +def pack_models(model_path_or_dir, out_file="models.espdl"): + """ + Pack all models into one binary file by the following format: + { + "PDL1": char[4] + model_num: uint32 + model1_data_offset: uint32 + model1_name_offset: uint32 + model1_name_length: uint32 + model2_data_offset: uint32 + model2_name_offset: uint32 + model2_name_length: uint32 + ... + model1_name, + model2_name, + ... + model1_data, + model2_data, + ... + }model_pack_t + + + model_path: the path of models + out_file: the ouput binary filename + """ + + if len(model_path_or_dir) == 1: + model_path_or_dir = Path(model_path_or_dir[0]) + if model_path_or_dir.is_file(): + shutil.copyfile(model_path_or_dir, out_file) + return + else: + model_files = sorted(list(model_path_or_dir.glob("*.espdl"))) + else: + model_files = [] + for model_path in sorted(model_path_or_dir): + model_path = Path(model_path) + assert model_path.is_file(), "invalid model_path." + model_files.append(model_path) + + model_names = [] + model_bins = [] + name_length = 0 + for model_file in model_files: + model_names.append(model_file.name) + model_bins.append(read_data(model_file)) + name_length += len(model_file.name) + print(model_file.name) + + model_num = len(model_names) + header_bin = struct_pack_string("PDL1", 4) + header_bin += struct.pack("I", model_num) + name_offset = 4 + 4 + model_num * 12 + data_offset = name_offset + name_length + name_bin = None + data_bin = None + for idx, name in enumerate(model_names): + if not name_bin: + name_bin = struct_pack_string(name, len(name)) # + model name + else: + name_bin += struct_pack_string(name, len(name)) + name_offset += len(model_names[idx - 1]) + + if not data_bin: + data_bin = model_bins[idx] + else: + data_bin += model_bins[idx] + data_offset += len(model_bins[idx - 1]) + + header_bin += struct.pack("I", data_offset) + header_bin += struct.pack("I", name_offset) + header_bin += struct.pack("I", len(name)) + out_bin = header_bin + name_bin + data_bin + with open(out_file, "wb") as f: + f.write(out_bin) + + +if __name__ == "__main__": + # input parameter + parser = argparse.ArgumentParser(description="esp-dl model package tool") + parser.add_argument( + "-m", "--model_path", type=str, nargs="+", help="the path of model files" + ) + parser.add_argument( + "-o", + "--out_file", + type=str, + default="models.espdl", + help="the path of binary file", + ) + args = parser.parse_args() + + pack_models(args.model_path, out_file=args.out_file) diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/CMakeLists.txt b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/CMakeLists.txt index 2962839b..2a97aa2c 100644 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/CMakeLists.txt +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/CMakeLists.txt @@ -19,4 +19,4 @@ endif() set(embed_files ${PEDESTRIAN_DETECT_MODEL}) -idf_component_register(SRC_DIRS ${src_dirs} INCLUDE_DIRS ${include_dirs} REQUIRES ${requires} EMBED_FILES ${embed_files}) \ No newline at end of file +idf_component_register(SRC_DIRS ${src_dirs} INCLUDE_DIRS ${include_dirs} REQUIRES ${requires} EMBED_FILES ${embed_files}) diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/LICENSE b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/LICENSE new file mode 100644 index 00000000..dc2be013 --- /dev/null +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Espressif Systems (Shanghai) Co., Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/dl_detect_pedestrian_postprocessor.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/dl_detect_pedestrian_postprocessor.cpp index b5fca973..79cd73ef 100644 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/dl_detect_pedestrian_postprocessor.cpp +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/dl_detect_pedestrian_postprocessor.cpp @@ -68,14 +68,14 @@ void PedestrianPostprocessor::parse_stage(TensorBase *score, TensorBa } template -void PedestrianPostprocessor::postprocess(std::map &model_outputs_map) +void PedestrianPostprocessor::postprocess() { - TensorBase *score0 = model_outputs_map.at("score0"); - TensorBase *bbox0 = model_outputs_map.at("bbox0"); - TensorBase *score1 = model_outputs_map.at("score1"); - TensorBase *bbox1 = model_outputs_map.at("bbox1"); - TensorBase *score2 = model_outputs_map.at("score2"); - TensorBase *bbox2 = model_outputs_map.at("bbox2"); + TensorBase *score0 = this->get_model_output("score0"); + TensorBase *bbox0 = this->get_model_output("bbox0"); + TensorBase *score1 = this->get_model_output("score1"); + TensorBase *bbox1 = this->get_model_output("bbox1"); + TensorBase *score2 = this->get_model_output("score2"); + TensorBase *bbox2 = this->get_model_output("bbox2"); this->parse_stage(score0, bbox0, 0); this->parse_stage(score1, bbox1, 1); diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/dl_detect_pedestrian_postprocessor.hpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/dl_detect_pedestrian_postprocessor.hpp index 5ae9ea75..31cb77ce 100644 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/dl_detect_pedestrian_postprocessor.hpp +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/dl_detect_pedestrian_postprocessor.hpp @@ -9,7 +9,7 @@ class PedestrianPostprocessor : public AnchorPointDetectPostprocessor { void parse_stage(TensorBase *score, TensorBase *box, const int stage_index); public: - void postprocess(std::map &model_outputs_map); + void postprocess() override; using AnchorPointDetectPostprocessor::AnchorPointDetectPostprocessor; }; } // namespace detect diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/idf_component.yml b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/idf_component.yml new file mode 100644 index 00000000..3a772d42 --- /dev/null +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/idf_component.yml @@ -0,0 +1,7 @@ +version: "0.1.0~1" +license: "MIT" +description: pedestrian detect model. +url: https://github.com/espressif/esp-dl/tree/master/models/pedestrian_detect +dependencies: + espressif/esp-dl: + version: "^3.0.0-rc.1" \ No newline at end of file diff --git a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/pedestrian_detect.cpp b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/pedestrian_detect.cpp index bcb3ff79..bf44141c 100644 --- a/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/pedestrian_detect.cpp +++ b/esp32-p4-function-ev-board/examples/esp_brookesia_phone/components/pedestrian_detect/pedestrian_detect.cpp @@ -34,7 +34,8 @@ Pedestrian::Pedestrian(const float score_threshold, const std::vector &mean, const std::vector &std) : model(new dl::Model((const char *)pedestrian_espdl)), - postprocessor(new dl::detect::PedestrianPostprocessor(score_threshold, nms_threshold, top_k, stages)) + postprocessor(new dl::detect::PedestrianPostprocessor( + this->model->get_outputs(), score_threshold, nms_threshold, top_k, stages)) { std::map model_inputs_map = this->model->get_inputs(); assert(model_inputs_map.size() == 1); @@ -76,7 +77,7 @@ std::list &Pedestrian::run(T *input_element, co this->postprocessor->clear_result(); this->postprocessor->set_resize_scale_x(this->image_preprocessor->get_resize_scale_x()); this->postprocessor->set_resize_scale_y(this->image_preprocessor->get_resize_scale_y()); - this->postprocessor->postprocess(model->get_outputs()); + this->postprocessor->postprocess(); std::list &result = this->postprocessor->get_result(input_shape); latency[2].end();