From fcc1e5a7656186a8c8de64f15af8eba370a7f4db Mon Sep 17 00:00:00 2001
From: yao <yao.zhang@greenwaves-technologies.com>
Date: Thu, 16 Dec 2021 18:53:38 +0100
Subject: [PATCH 1/3] [SDK] Update SDK

- GAP8: fix the I2S1 issue in FreeRTOS
- NNTool: fixes in ProcessArgPadding and the Quantizer generator
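
The PMSIS I2S driver also gains a TDM "frame" API
(pi_i2s_frame_channel_conf_set, pi_i2s_frame_read/_async,
pi_i2s_frame_write/_async) plus memory slab allocator support.
A minimal sketch of a blocking frame read, assuming an already opened
`i2s` device and illustrative sizes (not part of this patch):

    struct pi_i2s_channel_conf conf;
    uint32_t frame = (1 << 0) | (1 << 1);   /* frame made of slots 0 and 1 */

    pi_i2s_channel_conf_init(&conf);
    conf.options = PI_I2S_OPT_PINGPONG | PI_I2S_OPT_IS_RX | PI_I2S_OPT_ENABLED;
    conf.word_size = 16;
    conf.block_size = 256;                             /* bytes per slot */
    conf.pingpong_buffers[0] = pi_l2_malloc(256 * 2);  /* whole frame */
    conf.pingpong_buffers[1] = pi_l2_malloc(256 * 2);

    /* Each slot of the frame gets its configuration. */
    pi_i2s_frame_channel_conf_set(&i2s, frame, 0, &conf);
    pi_i2s_frame_channel_conf_set(&i2s, frame, 1, &conf);

    /* The returned chunk holds the samples of both slots back to back;
       the reported size is the size of one slot. */
    void *chunk;
    size_t size;
    pi_i2s_frame_read(&i2s, frame, &chunk, &size);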
---
 configs/common.sh                             |    4 +
 examples/autotiler/BilinearResize/Makefile    |    2 +-
 examples/nntool/mnist_gru/Makefile            |    5 +-
 examples/nntool/mnist_rnn/Makefile            |    3 +
 .../cpu/iss/vp/include/platform_wrapper.hpp   |    2 +-
 .../models/pulp/udma/i2s/udma_i2s_v3.cpp      |   15 +-
 .../models/pulp/udma/udma_v4_addrgens.cpp     |   22 +-
 .../models/pulp/udma/udma_v4_rx_channels.cpp  |    3 +-
 .../models/pulp/udma/udma_v4_tx_channels.cpp  |    5 +-
 libs/gap_lib/testbench/testlib.c              |  376 ++++-
 libs/gap_lib/testbench/testlib.h              |   19 +-
 .../pmsis/drivers/udma/i2s/i2s_internal.c     |    6 +-
 .../vendors/gwt/libs/include/string.h         |    8 +
 .../vendors/gwt/pmsis/include/pmsis.h         |    5 +
 .../gwt/pmsis/rtos/include/pmsis/rtos/os/os.h |    8 +
 .../vendors/gwt/rules/freeRTOS_rules.mk       |    9 +-
 .../pmsis_api/include/pmsis/drivers/i2s.h     |  187 ++-
 rtos/pulp/pulpos-2/include/pmsis.h            |    2 +
 rtos/pulp/pulpos-2/include/pos/data/data.h    |    9 +
 .../pulp/pulpos-2/include/pos/implem/implem.h |    1 +
 rtos/pulp/pulpos-2/rules/pulpos/src.mk        |    2 +-
 testplan.cfg                                  |    2 +
 .../CNN_Generators/CNN_Copy_Generators.c      |   90 +-
 .../CNN_Generators_NE16/CNN_Generators_NE16.c |   90 +-
 .../CNN_Generators_NE16/CNN_Generators_NE16.h |    2 +-
 .../CNN_Generators_SQ8/CNN_Generators_SQ8.c   |  390 ++++-
 .../CNN_Generators_SQ8/CNN_Generators_SQ8.h   |  128 ++
 .../CNN_Generators_fp16/CNN_Generators_fp16.c |  482 ++++--
 .../CNN_Generators_fp16/CNN_Generators_fp16.h |  184 ++-
 tools/autotiler_v3/CNN_Libraries/CNN_Copy.c   |  246 ++-
 tools/autotiler_v3/CNN_Libraries/CNN_Copy.h   |   82 +
 .../CNN_BasicKernels_NE16.c                   |  104 +-
 .../CNN_BasicKernels_NE16.h                   |    4 +-
 .../CNN_Libraries_SQ8/CNN_BasicKernels_SQ8.h  |    3 +
 .../CNN_Libraries_SQ8/CNN_MatAlgebra_SQ8.c    |   34 +-
 .../CNN_Libraries_SQ8/CNN_MatMul_Conv_SQ8.c   | 1313 ++++++++++++-----
 .../CNN_BasicKernels_fp16.h                   |   12 +-
 .../CNN_Libraries_fp16/CNN_MatAlgebra_fp16.c  |   14 +-
 .../CNN_Libraries_fp16/CNN_MatMul_Conv_fp16.c |  246 ++-
 .../autotiler_v3/DSP_Libraries/FloatDefines.h |    1 +
 .../LUT_Tables/gen_scripts/GenMFCCLUT.py      |    2 +-
 tools/autotiler_v3/Emulation/GapBuiltins.h    |   21 +
 .../BilinearResizes/ResizeBasicKernels.c      |   85 ++
 .../BilinearResizes/ResizeBasicKernels.h      |   21 +
 .../BilinearResizes/ResizeGenerator.c         |   71 +
 .../BilinearResizes/ResizeGenerator.h         |    1 +
 tools/autotiler_v3/Makefile                   |    2 +-
 tools/autotiler_v3/version.cfg                |    2 +-
 tools/jenkins/gap_sdk_version.txt             |    2 +-
 tools/nntool/.gitignore                       |    3 +
 tools/nntool/generation/code_block.py         |    6 +-
 .../general/resizer_kernel_generator.py       |   13 +-
 .../pow2/conv_pool_relu_kernels_generator.py  |    2 +-
 .../new_generators/general/dsp_generators.py  |    2 +-
 .../general/quantize_parameters.py            |   24 +-
 .../new_generators/mult8/matmul_mult8.py      |   18 +-
 tools/nntool/graph/dim.py                     |    6 +
 .../graph/manipulations/adjust_order.py       |    4 +-
 .../nntool/graph/manipulations/dimensions.py  |    2 +-
 .../eliminate_transposes.py                   |  406 +++--
 .../eliminate_transposes_actions.py           |   73 +-
 .../graph/matches/matchers/move_node_up.py    |   33 +-
 .../graph/matches/matchers/slice_to_split.py  |    2 +-
 tools/nntool/graph/nngraph.py                 |   11 +-
 tools/nntool/graph/types/conv2d.py            |    9 +-
 tools/nntool/graph/types/dsp_preprocessing.py |    8 +-
 tools/nntool/graph/types/input_output.py      |    8 +-
 tools/nntool/graph/types/pooling.py           |   14 +-
 tools/nntool/graph/types/tensor_arithmetic.py |    8 +-
 .../onnx/handlers/backend/conv_mixin.py       |  159 +-
 .../importer/onnx/handlers/backend/gather.py  |   12 +-
 .../onnx/handlers/backend/mat_mul_mixin.py    |    8 +-
 .../onnx/handlers/backend/pad_mixin.py        |    1 +
 .../float/kernels/matrix_operations.py        |    5 +-
 .../float/quantizers/conv_fusion_float.py     |   13 +-
 .../multiplicative/quantizers/filter_mult.py  |   13 +-
 .../symmetric/kernels/dsp_preprocessing.py    |    8 +-
 .../symmetric/kernels/matrix_operations.py    |    6 +-
 tools/nntool/utils/gen_twiddles.py            |   15 +-
 utils/gapy/run.py                             |    5 +
 utils/gapy/runner/board/board_runner.py       |   13 +-
 utils/rules/pulp_rules.mk                     |   10 +-
 82 files changed, 4110 insertions(+), 1137 deletions(-)

diff --git a/configs/common.sh b/configs/common.sh
index 981aff43a..e3d675d5d 100644
--- a/configs/common.sh
+++ b/configs/common.sh
@@ -139,3 +139,7 @@ else
         export XCSIM_PLATFORM=$XCSIM_PATH
     fi
 fi
+
+if [ -f "$GAP_SDK_HOME/configs/wsl.sh" ]; then
+    source "$GAP_SDK_HOME/configs/wsl.sh"
+fi
diff --git a/examples/autotiler/BilinearResize/Makefile b/examples/autotiler/BilinearResize/Makefile
index f7259a0d4..91e74b484 100644
--- a/examples/autotiler/BilinearResize/Makefile
+++ b/examples/autotiler/BilinearResize/Makefile
@@ -12,7 +12,7 @@ RESIZE_KER_PATH = $(TILER_BILINEAR_RESIZE_KERNEL_PATH)
 APP_SRCS += Bilinear_Resize.c ImgIO.c
 APP_SRCS += ResizeKernels.c $(RESIZE_KER_PATH)/ResizeBasicKernels.c
 
-APP_INC += . $(TILER_EMU_INC) $(TILER_INC) $(RESIZE_KER_PATH)
+APP_INC += . $(TILER_EMU_INC) $(TILER_INC) $(RESIZE_KER_PATH) $(TILER_DSP_KERNEL_PATH) $(TILER_CNN_KERNEL_PATH_FP16)
 
 APP_CFLAGS += -O3 -mno-memcpy -fno-tree-loop-distribute-patterns $(USER_FLAGS)
 APP_CFLAGS +=  -Wno-maybe-uninitialized -Wno-unused-but-set-variable -Wno-unused-variable
diff --git a/examples/nntool/mnist_gru/Makefile b/examples/nntool/mnist_gru/Makefile
index ca053caf2..576063fae 100644
--- a/examples/nntool/mnist_gru/Makefile
+++ b/examples/nntool/mnist_gru/Makefile
@@ -64,7 +64,10 @@ PULP_APP = mnist
 APP = mnist
 APP_SRCS += $(MODEL_PREFIX).c $(MODEL_GEN_C) $(MODEL_COMMON_SRCS) $(CNN_LIB)
 
-APP_CFLAGS += -g -O3 -mno-memcpy -fno-tree-loop-distribute-patterns
+ifeq '$(TARGET_CHIP_FAMILY)' 'GAP8'
+	APP_CFLAGS += -march=rv32imcxgap8
+endif
+APP_CFLAGS += -g -O3 -mno-memcpy -fno-tree-loop-distribute-patterns
 APP_CFLAGS += -I. -I$(MODEL_COMMON_INC) -I$(TILER_EMU_INC) -I$(TILER_INC) $(CNN_LIB_INCLUDE) -I$(MODEL_BUILD)
 APP_CFLAGS += -DPERF -DAT_MODEL_PREFIX=$(MODEL_PREFIX) $(MODEL_SIZE_CFLAGS)
 APP_CFLAGS += -DSTACK_SIZE=$(CLUSTER_STACK_SIZE) -DSLAVE_STACK_SIZE=$(CLUSTER_SLAVE_STACK_SIZE)
diff --git a/examples/nntool/mnist_rnn/Makefile b/examples/nntool/mnist_rnn/Makefile
index 16ee1c51b..04db45099 100644
--- a/examples/nntool/mnist_rnn/Makefile
+++ b/examples/nntool/mnist_rnn/Makefile
@@ -62,6 +62,9 @@ PULP_APP = mnist
 APP = mnist
 APP_SRCS += $(MODEL_PREFIX).c $(MODEL_GEN_C) $(MODEL_COMMON_SRCS) $(CNN_LIB)
 
+ifeq '$(TARGET_CHIP_FAMILY)' 'GAP8'
+        APP_CFLAGS += -march=rv32imcxgap8
+endif
 APP_CFLAGS += -g -O3 -mno-memcpy -fno-tree-loop-distribute-patterns
 APP_CFLAGS += -I. -I$(MODEL_COMMON_INC) -I$(TILER_EMU_INC) -I$(TILER_INC) $(CNN_LIB_INCLUDE) -I$(MODEL_BUILD)
 APP_CFLAGS += -DPERF -DAT_MODEL_PREFIX=$(MODEL_PREFIX) $(MODEL_SIZE_CFLAGS)
diff --git a/gvsoc/gvsoc/models/cpu/iss/vp/include/platform_wrapper.hpp b/gvsoc/gvsoc/models/cpu/iss/vp/include/platform_wrapper.hpp
index a386c3768..f1358d239 100644
--- a/gvsoc/gvsoc/models/cpu/iss/vp/include/platform_wrapper.hpp
+++ b/gvsoc/gvsoc/models/cpu/iss/vp/include/platform_wrapper.hpp
@@ -170,7 +170,7 @@ static inline int iss_fetch_req_common(iss_t *_this, uint64_t addr, uint8_t *dat
   if (err != vp::IO_REQ_OK)
   {
     if (err == vp::IO_REQ_INVALID)
-      _this->trace.fatal("Invalid fetch request (addr: 0x%x, size: 0x%x)\n", addr, size);
+      _this->trace.force_warning("Invalid fetch request (addr: 0x%x, size: 0x%x)\n", addr, size);
     else
     {
       iss_exec_insn_stall(_this);
diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/i2s/udma_i2s_v3.cpp b/gvsoc/gvsoc_gap/models/pulp/udma/i2s/udma_i2s_v3.cpp
index 3296d1307..8eecde9f1 100644
--- a/gvsoc/gvsoc_gap/models/pulp/udma/i2s/udma_i2s_v3.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/udma/i2s/udma_i2s_v3.cpp
@@ -239,29 +239,32 @@ void I2s_periph::handle_sdo(bool no_restart)
                 {
                     this->tx_wait_data_init &= ~(1 << this->active_channel);
                     data = this->tx_fifo[this->active_channel].front();
-
                     this->tx_fifo[this->active_channel].pop();
                 }
                 else
                 {
                     if (((this->tx_wait_data_init >> this->active_channel) & 1) == 0)
                     {
+                        this->trace.msg(vp::trace::LEVEL_DEBUG, "Generating TX error (slot: %d)\n", this->active_channel);
                         this->regmap.err_status.set(this->regmap.err_status.get() | (1 << (this->active_channel + 16)));
                     }
                 }
 
                 this->tx_pending_value = this->handle_tx_format(channel, data);
 
-                this->trace.msg(vp::trace::LEVEL_DEBUG, "Got new TX sample (value: 0x%x, width: %d)\n", this->tx_pending_value, this->tx_pending_bits);
-
+                this->trace.msg(vp::trace::LEVEL_DEBUG, "Got new TX sample (slot: %d, value: 0x%x, width: %d)\n", this->active_channel, this->tx_pending_value, this->tx_pending_bits);
 
                 // Now ask the next sample to the channel so that it is ready for the
                 // next frame.
                 // The channel id is pushed now since we the samples are received in order
                 // and we don't know what is the slot when we receive the sample from the channel
 
-                this->tx_fifo_slot_id.push(this->active_channel);
-                channel->get_data(channel->slot_cfg->tx_dsize_get() + 1, channel->slot_cfg->tx_id_get());
+                if (channel->is_active() || channel->slot_cfg->tx_id_get() >= 0xe0)
+                {
+                    this->tx_fifo_slot_id.push(this->active_channel);
+
+                    channel->get_data(channel->slot_cfg->tx_dsize_get() + 1, channel->slot_cfg->tx_id_get());
+                }
             }
 
             if (this->tx_pending_bits > 0)
@@ -1065,7 +1068,7 @@ void I2s_tx_channel::push_data(uint8_t *data, int size)
     uint32_t value = 0;
     memcpy((void *)&value, (void *)data, size);
 
-    this->periph->trace.msg(vp::trace::LEVEL_INFO, "Received TX sample from memory (value: 0x%x)\n", value);
+    this->periph->trace.msg(vp::trace::LEVEL_INFO, "Received TX sample from memory (slot: %d, value: 0x%x)\n", this->periph->tx_fifo_slot_id.front(), value);
 
     this->periph->tx_fifo[this->periph->tx_fifo_slot_id.front()].push(value);
     this->periph->tx_fifo_slot_id.pop();
diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_addrgens.cpp b/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_addrgens.cpp
index a427b1787..43d48b9cb 100644
--- a/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_addrgens.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_addrgens.cpp
@@ -75,7 +75,7 @@ void Udma_addrgen_linear::check_pending_transfer()
 {
     if (!this->active_transfer && this->nb_pending_transfers)
     {
-        trace.msg(vp::trace::LEVEL_TRACE, "Starting new buffer (addr: 0x%x, size: 0x%x)\n", this->pending_addr, this->pending_size);
+        trace.msg(vp::trace::LEVEL_TRACE, "Starting new buffer (addr: 0x%x, size: 0x%x, pending: %d)\n", this->pending_addr, this->pending_size, this->nb_pending_transfers);
 
         this->nb_pending_transfers--;
         this->set_active_transfer(true);
@@ -101,17 +101,25 @@ void Udma_addrgen_linear::cfg_ctrl_req(uint64_t reg_offset, int size, uint8_t *v
         }
         else if (this->regmap.cfg_ctrl.en_get())
         {
+            this->trace.msg(vp::trace::LEVEL_TRACE, "Enqeueing transfer (pending: %d)\n", this->nb_pending_transfers);
 
-            if (this->nb_pending_transfers > 0)
+            if (this->nb_pending_transfers == 1)
             {
-                this->remaining_size -= this->pending_size;
+                this->trace.force_warning("Trying to enqueue while alreay 2 transfers are enqueued\n");
             }
+            else
+            {
+                if (this->nb_pending_transfers > 0)
+                {
+                    this->remaining_size -= this->pending_size;
+                }
 
-            this->pending_addr = this->regmap.cfg_sa_buf0.get();
-            this->pending_size = this->regmap.cfg_size.get();
-            this->remaining_size += this->pending_size;
+                this->pending_addr = this->regmap.cfg_sa_buf0.get();
+                this->pending_size = this->regmap.cfg_size.get();
+                this->remaining_size += this->pending_size;
 
-            this->nb_pending_transfers++;
+                this->nb_pending_transfers++;
+            }
 
             this->check_pending_transfer();
 
diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_rx_channels.cpp b/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_rx_channels.cpp
index fc2fb58b0..9553e0ccc 100644
--- a/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_rx_channels.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_rx_channels.cpp
@@ -186,8 +186,7 @@ void Udma_rx_channels::handle_pending(void *__this, vp::clock_event *event)
 
                 if (addr == 0)
                 {
-                    fflush(NULL);
-                    abort();
+                    _this->top->trace.fatal("UDMA trying to access NULL\n");
                 }
 
                 if (err == vp::IO_REQ_OK)
diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_tx_channels.cpp b/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_tx_channels.cpp
index 90e97e6d4..202cd2f10 100644
--- a/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_tx_channels.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_tx_channels.cpp
@@ -83,7 +83,7 @@ void Udma_tx_channel::get_data(int size, int channel)
 {
     this->requested_size_queue.push_back(size);
     this->requested_size += size;
-    if (this->requested_size_queue.size() == 1 && this->is_active() && !this->enqueued)
+    if (this->requested_size_queue.size() > 0 && this->is_active() && !this->enqueued)
     {
         this->top->tx_channels->push_ready_channel(this);
         this->check_state();
@@ -243,7 +243,7 @@ void Udma_tx_channels::handle_pending(void *__this, vp::clock_event *event)
             channel->requested_size_queue.push_front(requested_size_queue - size);
         }
 
-        if (channel->requested_size_queue.size() && !channel->enqueued)
+        if (channel->requested_size_queue.size() && !channel->enqueued && channel->is_active())
         {
             _this->push_ready_channel(channel);
         }
@@ -307,7 +307,6 @@ void Udma_tx_channels::check_state()
 {
     if (!this->send_reqs_event->is_enqueued())
     {
-        if (!this->pending_channels.empty() && !this->l2_free_reqs->is_empty())
         if (!this->pending_channels.empty() && !this->l2_free_reqs->is_empty() && this->pending_channels.front()->is_active())
         {
             this->top->event_enqueue(this->send_reqs_event, 1);
diff --git a/libs/gap_lib/testbench/testlib.c b/libs/gap_lib/testbench/testlib.c
index 5cfd79862..1619bdc2a 100644
--- a/libs/gap_lib/testbench/testlib.c
+++ b/libs/gap_lib/testbench/testlib.c
@@ -185,7 +185,14 @@ void i2s_slot_deinit(i2s_slot_test_t *i2s_slot)
         struct pi_i2s_channel_conf i2s_conf;
         pi_i2s_channel_conf_init(&i2s_conf);
         i2s_conf.options = PI_I2S_OPT_DISABLED | (i2s_slot->is_rx ? PI_I2S_OPT_IS_RX: PI_I2S_OPT_IS_TX);
-        pi_i2s_channel_conf_set(i2s_slot->i2s, i2s_slot->slot, &i2s_conf);
+        if (i2s_slot->frame)
+        {
+            pi_i2s_frame_channel_conf_set(i2s_slot->i2s, i2s_slot->frame, i2s_slot->slot, &i2s_conf);
+        }
+        else
+        {
+            pi_i2s_channel_conf_set(i2s_slot->i2s, i2s_slot->slot, &i2s_conf);
+        }
 
         pi_l2_free(i2s_slot->buffers[0], i2s_slot->buffer_size);
         pi_l2_free(i2s_slot->buffers[1], i2s_slot->buffer_size);
@@ -193,7 +200,12 @@ void i2s_slot_deinit(i2s_slot_test_t *i2s_slot)
 }
 
 
-int i2s_slot_init(i2s_slot_test_t *i2s_slot, struct pi_device *i2s, i2s_slot_config_t *slot_config)
+void i2s_slot_new(i2s_slot_test_t *i2s_slot)
+{
+    pi_task_block(&i2s_slot->end_task);
+}
+
+int i2s_slot_init(i2s_test_t *test, i2s_slot_test_t *i2s_slot, struct pi_device *i2s, i2s_slot_config_t *slot_config)
 {
     i2s_slot->i2s = i2s;
     i2s_slot->itf = slot_config->itf;
@@ -209,10 +221,12 @@ int i2s_slot_init(i2s_slot_test_t *i2s_slot, struct pi_device *i2s, i2s_slot_con
     i2s_slot->random_mute = slot_config->random_mute;
     i2s_slot->format = slot_config->format;
     i2s_slot->bypass = slot_config->bypass;
+    i2s_slot->frame = slot_config->frame;
+    i2s_slot->test = test;
+    i2s_slot->flags.use_slab = slot_config->slab != 0;
 
     if (!i2s_slot->bypass || !i2s_slot->is_rx)
     {
-        pi_task_block(&i2s_slot->end_task);
 
         struct pi_i2s_channel_conf i2s_conf;
         pi_i2s_channel_conf_init(&i2s_conf);
@@ -229,21 +243,85 @@ int i2s_slot_init(i2s_slot_test_t *i2s_slot, struct pi_device *i2s, i2s_slot_con
         {
             i2s_conf.options = PI_I2S_OPT_PINGPONG | PI_I2S_OPT_IS_TX | PI_I2S_OPT_ENABLED;
         }
-        i2s_slot->buffers[0] = pi_l2_malloc(buffer_size);
-        if (i2s_slot->buffers[0] == NULL)
+
+        if (slot_config->slab)
         {
-            printf("Failed to allocate\n");
-            return -1;
+            if (slot_config->frame)
+            {
+                if (__FF1(slot_config->frame) == i2s_slot->slot)
+                {
+                    int nb_slots = __builtin_popcount(slot_config->frame);
+
+                    void *buffers = pi_l2_malloc(buffer_size * nb_slots * slot_config->slab);
+                    if (buffers == NULL)
+                    {
+                        printf("Failed to allocate\n");
+                        return -1;
+                    }
+
+                    pi_mem_slab_init(&i2s_slot->slab, buffers, buffer_size * nb_slots, slot_config->slab);
+                }
+            }
+            else
+            {
+                void *buffers = pi_l2_malloc(buffer_size * slot_config->slab);
+                if (buffers == NULL)
+                {
+                    printf("Failed to allocate\n");
+                    return -1;
+                }
+
+                pi_mem_slab_init(&i2s_slot->slab, buffers, buffer_size, slot_config->slab);
+            }
         }
-        i2s_slot->buffers[1] = pi_l2_malloc(buffer_size);
-        if (i2s_slot->buffers[1] == NULL)
+        else
         {
-            printf("Failed to allocate\n");
-            return -1;
+            if (slot_config->frame)
+            {
+                if (__FF1(slot_config->frame) == i2s_slot->slot)
+                {
+                    int nb_slots = __builtin_popcount(slot_config->frame);
+
+                    i2s_slot->buffers[0] = pi_l2_malloc(buffer_size*nb_slots);
+                    if (i2s_slot->buffers[0] == NULL)
+                    {
+                        printf("Failed to allocate\n");
+                        return -1;
+                    }
+                    i2s_slot->buffers[1] = pi_l2_malloc(buffer_size*nb_slots);
+                    if (i2s_slot->buffers[1] == NULL)
+                    {
+                        printf("Failed to allocate\n");
+                        return -1;
+                    }
+                }
+            }
+            else
+            {
+                i2s_slot->buffers[0] = pi_l2_malloc(buffer_size);
+                if (i2s_slot->buffers[0] == NULL)
+                {
+                    printf("Failed to allocate\n");
+                    return -1;
+                }
+                i2s_slot->buffers[1] = pi_l2_malloc(buffer_size);
+                if (i2s_slot->buffers[1] == NULL)
+                {
+                    printf("Failed to allocate\n");
+                    return -1;
+                }
+            }
         }
 
-        i2s_conf.pingpong_buffers[0] = i2s_slot->buffers[0];
-        i2s_conf.pingpong_buffers[1] = i2s_slot->buffers[1];
+        if (slot_config->slab)
+        {
+            i2s_conf.mem_slab = &i2s_slot->slab;
+        }
+        else
+        {
+            i2s_conf.pingpong_buffers[0] = i2s_slot->buffers[0];
+            i2s_conf.pingpong_buffers[1] = i2s_slot->buffers[1];
+        }
 
         i2s_conf.block_size = buffer_size;
         i2s_conf.word_size = slot_config->word_size;
@@ -287,8 +365,16 @@ int i2s_slot_init(i2s_slot_test_t *i2s_slot, struct pi_device *i2s, i2s_slot_con
             i2s_conf.slot_enable = 0;
         }
 
-        if (pi_i2s_channel_conf_set(i2s, slot_config->slot, &i2s_conf))
-            return -1;
+        if (slot_config->frame)
+        {
+            if (pi_i2s_frame_channel_conf_set(i2s, slot_config->frame, slot_config->slot, &i2s_conf))
+                return -1;
+        }
+        else
+        {
+            if (pi_i2s_channel_conf_set(i2s, slot_config->slot, &i2s_conf))
+                return -1;
+        }
 
         if (slot_config -> ts_evt_en)
         {
@@ -343,14 +429,53 @@ void i2s_slot_callback_tx_file_dumper(void *arg)
 
     if (i2s_slot->nb_sample > 0 || i2s_slot->nb_sample == -1)
     {
-        void *buffer = i2s_slot->tx_buffers[i2s_slot->tx_buffer];
-        i2s_slot->tx_buffer ^= 1;
-        for (int i=0; i<i2s_slot->nb_elem; i++)
+        void *buffer;
+
+        if (i2s_slot->flags.use_slab)
+        {
+            pi_mem_slab_alloc(&i2s_slot->slab, (void **)&buffer, 0);
+        }
+        else
         {
-            set_buffer_elem_iter(i2s_slot, buffer, i);
+            buffer = i2s_slot->tx_buffers[i2s_slot->tx_buffer];
+            i2s_slot->tx_buffer ^= 1;
         }
 
-        int err = pi_i2s_channel_write_async(i2s_slot->i2s, i2s_slot->slot, NULL, i2s_slot->buffer_size, pi_task_callback(&i2s_slot->task, i2s_slot_callback_tx_file_dumper, (void *)i2s_slot));
+        if (i2s_slot->frame)
+        {
+            uint32_t frame = i2s_slot->frame;
+            uint32_t addr = (uint32_t)buffer;
+            while(frame)
+            {
+                int slot_id = __FF1(frame);
+                frame = __BITCLR_R(frame, 1, slot_id);
+
+                for (int i=0; i<i2s_slot->nb_elem; i++)
+                {
+                    set_buffer_elem_iter(&i2s_slot->test->slot_test_tx[slot_id], (void *)addr, i);
+                }
+
+                addr = addr + i2s_slot->buffer_size;
+            }
+        }
+        else
+        {
+            for (int i=0; i<i2s_slot->nb_elem; i++)
+            {
+                set_buffer_elem_iter(i2s_slot, buffer, i);
+            }
+        }
+
+        if (i2s_slot->frame)
+        {
+            int err = pi_i2s_frame_write_async(i2s_slot->i2s, i2s_slot->frame, buffer, i2s_slot->buffer_size, pi_task_callback(&i2s_slot->task, i2s_slot_callback_tx_file_dumper, (void *)i2s_slot));
+        }
+        else
+        {
+            int err = pi_i2s_channel_write_async(i2s_slot->i2s, i2s_slot->slot, buffer, i2s_slot->buffer_size, pi_task_callback(&i2s_slot->task, i2s_slot_callback_tx_file_dumper, (void *)i2s_slot));
+        }
     }
     else
     {
@@ -389,18 +514,8 @@ static uint32_t buffer_get_elem(void *buffer, int index, int word_size, void **a
     }
 }
 
-void i2s_slot_callback_rx_iter(void *arg)
+int i2s_slot_callback_rx_iter_check(i2s_slot_test_t *i2s_slot, void *chunk, int size)
 {
-    i2s_slot_test_t *i2s_slot = (i2s_slot_test_t *)arg;
-    void *chunk;
-    int size;
-
-    if (pi_i2s_read_status(&i2s_slot->task, &chunk, (size_t *)&size))
-    {
-        i2s_slot->retval++;
-        goto end;
-    }
-
     int nb_elem = size / i2s_slot->elem_size;
 
     for (int i=0; i<nb_elem && i2s_slot->nb_sample > 0; i++)
@@ -428,15 +543,65 @@ void i2s_slot_callback_rx_iter(void *arg)
         {
             printf("Detected error (itf: %d, slot: %d, index: %d, nb_elem: %d, expected: 0x%x, got: 0x%x, address: %p)\n", i2s_slot->itf, i2s_slot->slot, i, nb_elem, current_value, value, address);
             i2s_slot->retval++;
-            goto end;
+            return 1;
         }
 
         i2s_slot->nb_sample--;
         if (i2s_slot->nb_sample == 0)
+            return 1;
+    }
+
+    return 0;
+}
+
+
+void i2s_slot_callback_rx_iter(void *arg)
+{
+    i2s_slot_test_t *i2s_slot = (i2s_slot_test_t *)arg;
+    void *chunk;
+    int size;
+
+    if (pi_i2s_read_status(&i2s_slot->task, &chunk, (size_t *)&size))
+    {
+        i2s_slot->retval++;
+        goto end;
+    }
+
+    if (i2s_slot->frame)
+    {
+        uint32_t frame = i2s_slot->frame;
+        uint32_t buffer = (uint32_t)chunk;
+        while(frame)
+        {
+            int slot_id = __FF1(frame);
+            frame = __BITCLR_R(frame, 1, slot_id);
+
+            if (i2s_slot_callback_rx_iter_check(&i2s_slot->test->slot_test_rx[slot_id], (void *)buffer, size))
+                goto end;
+
+            buffer += size;
+        }
+
+        pi_i2s_frame_read_async(i2s_slot->i2s, i2s_slot->frame, pi_task_callback(&i2s_slot->task, i2s_slot_callback_rx_iter, (void *)i2s_slot));
+
+        if (i2s_slot->flags.use_slab)
+        {
+            pi_mem_slab_free(&i2s_slot->slab, &chunk);
+        }
+    }
+    else
+    {
+        if (i2s_slot_callback_rx_iter_check(i2s_slot, chunk, size))
             goto end;
+
+        pi_i2s_channel_read_async(i2s_slot->i2s, i2s_slot->slot, pi_task_callback(&i2s_slot->task, i2s_slot_callback_rx_iter, (void *)i2s_slot));
+
+        if (i2s_slot->flags.use_slab)
+        {
+            pi_mem_slab_free(&i2s_slot->slab, &chunk);
+        }
     }
 
-    pi_i2s_channel_read_async(i2s_slot->i2s, i2s_slot->slot, pi_task_callback(&i2s_slot->task, i2s_slot_callback_rx_iter, (void *)i2s_slot));
     return;
 
 end:
@@ -471,7 +636,17 @@ int i2s_slot_start(i2s_slot_test_t *i2s_slot, i2s_slot_start_config_t *config)
             if (i2s_slot->incr_value >= i2s_slot->incr_end)
                 i2s_slot->incr_value = 0;
 
-            pi_i2s_channel_read_async(i2s_slot->i2s, i2s_slot->slot, pi_task_callback(&i2s_slot->task, i2s_slot_callback_rx_iter, (void *)i2s_slot));
+            if (i2s_slot->frame == 0 || __FF1(i2s_slot->frame) == i2s_slot->slot)
+            {
+                if (i2s_slot->frame)
+                {
+                    pi_i2s_frame_read_async(i2s_slot->i2s, i2s_slot->frame, pi_task_callback(&i2s_slot->task, i2s_slot_callback_rx_iter, (void *)i2s_slot));
+                }
+                else
+                {
+                    pi_i2s_channel_read_async(i2s_slot->i2s, i2s_slot->slot, pi_task_callback(&i2s_slot->task, i2s_slot_callback_rx_iter, (void *)i2s_slot));
+                }
+            }
         }
         else if (config->type == I2S_VERIF_TX_ITER)
         {
@@ -484,16 +659,92 @@ int i2s_slot_start(i2s_slot_test_t *i2s_slot, i2s_slot_start_config_t *config)
             if (i2s_slot->incr_value >= i2s_slot->incr_end)
                 i2s_slot->incr_value = 0;
 
-            for (int i=0; i<i2s_slot->nb_elem; i++)
+            void *buffers[2] = {0};
+
+            if (i2s_slot->frame)
             {
-                set_buffer_elem_iter(i2s_slot, i2s_slot->tx_buffers[0], i);
+                if (__FL1(i2s_slot->frame) == i2s_slot->slot)
+                {
+                    i2s_slot_test_t *first_slot = &i2s_slot->test->slot_test_tx[__FF1(i2s_slot->frame)];
+                    uint32_t frame = i2s_slot->frame;
+                    if (i2s_slot->flags.use_slab)
+                    {
+                        pi_mem_slab_alloc(&first_slot->slab, (void **)&buffers[0], 0);
+                        pi_mem_slab_alloc(&first_slot->slab, (void **)&buffers[1], 0);
+                    }
+                    else
+                    {
+                        buffers[0] = first_slot->tx_buffers[0];
+                        buffers[1] = first_slot->tx_buffers[1];
+                    }
+
+                    uint32_t buffer0 = (uint32_t)buffers[0];
+                    uint32_t buffer1 = (uint32_t)buffers[1];
+
+                    while(frame)
+                    {
+                        int slot_id = __FF1(frame);
+                        frame = __BITCLR_R(frame, 1, slot_id);
+
+                        i2s_slot_test_t *frame_slot = &i2s_slot->test->slot_test_tx[slot_id];
+
+
+                        for (int i=0; i<i2s_slot->nb_elem; i++)
+                        {
+                            set_buffer_elem_iter(frame_slot, (void *)buffer0, i);
+                        }
+
+                        for (int i=0; i<i2s_slot->nb_elem; i++)
+                        {
+                            set_buffer_elem_iter(frame_slot, (void *)buffer1, i);
+                        }
+
+                        buffer0 += i2s_slot->buffer_size;
+                        buffer1 += i2s_slot->buffer_size;
+                    }
+                }
             }
-            for (int i=0; i<i2s_slot->nb_elem; i++)
+            else
             {
-                set_buffer_elem_iter(i2s_slot, i2s_slot->tx_buffers[1], i);
+                if (i2s_slot->flags.use_slab)
+                {
+                    pi_mem_slab_alloc(&i2s_slot->slab, (void **)&buffers[0], 0);
+                    pi_mem_slab_alloc(&i2s_slot->slab, (void **)&buffers[1], 0);
+                }
+                else
+                {
+                    buffers[0] = i2s_slot->tx_buffers[0];
+                    buffers[1] = i2s_slot->tx_buffers[1];
+                }
+
+                if (i2s_slot->incr_value >= i2s_slot->incr_end)
+                    i2s_slot->incr_value = 0;
+
+                for (int i=0; i<i2s_slot->nb_elem; i++)
+                {
+                    set_buffer_elem_iter(i2s_slot, buffers[0], i);
+                }
+                for (int i=0; i<i2s_slot->nb_elem; i++)
+                {
+                    set_buffer_elem_iter(i2s_slot, buffers[1], i);
+                }
             }
 
-            pi_i2s_channel_write_async(i2s_slot->i2s, i2s_slot->slot, NULL, i2s_slot->buffer_size, pi_task_callback(&i2s_slot->task, i2s_slot_callback_tx_file_dumper, (void *)i2s_slot));
+            if (i2s_slot->frame == 0 || __FL1(i2s_slot->frame) == i2s_slot->slot)
+            {
+                if (i2s_slot->frame)
+                {
+                    i2s_slot_test_t *first_slot = &i2s_slot->test->slot_test_tx[__FF1(i2s_slot->frame)];
+                    pi_i2s_frame_write_async(first_slot->i2s, first_slot->frame, buffers[0], first_slot->buffer_size, pi_task_callback(&first_slot->task, i2s_slot_callback_tx_file_dumper, (void *)first_slot));
+                    pi_i2s_frame_write_async(first_slot->i2s, first_slot->frame, buffers[1], first_slot->buffer_size, pi_task_callback(&first_slot->task, i2s_slot_callback_tx_file_dumper, (void *)first_slot));
+
+                }
+                else
+                {
+                    pi_i2s_channel_write_async(i2s_slot->i2s, i2s_slot->slot, buffers[0], i2s_slot->buffer_size, pi_task_callback(&i2s_slot->task, i2s_slot_callback_tx_file_dumper, (void *)i2s_slot));
+                    pi_i2s_channel_write_async(i2s_slot->i2s, i2s_slot->slot, buffers[1], i2s_slot->buffer_size, pi_task_callback(&i2s_slot->task, i2s_slot_callback_tx_file_dumper, (void *)i2s_slot));
+                }
+            }
         }
     }
 
@@ -506,7 +757,7 @@ int i2s_slot_start(i2s_slot_test_t *i2s_slot, i2s_slot_start_config_t *config)
 
 int i2s_slot_wait(i2s_slot_test_t *i2s_slot)
 {
-    if (i2s_slot->bypass)
+    if (i2s_slot->bypass || (i2s_slot->frame && i2s_slot->slot != __FF1(i2s_slot->frame)))
     {
         return 0;
     }
@@ -557,8 +808,13 @@ int i2s_test_init(i2s_test_t *test, i2s_test_config_t *config)
 
     fifo_id = config->fifo_id;
 
+    uint32_t rx_frames[4] = { (config->rx_frames >> 0) & 0xffff, (config->rx_frames >> 16) & 0xffff, (config->rx_frames >> 32) & 0xffff, (config->rx_frames >> 48) & 0xffff};
+    uint32_t tx_frames[4] = { (config->tx_frames >> 0) & 0xffff, (config->tx_frames >> 16) & 0xffff, (config->tx_frames >> 32) & 0xffff, (config->tx_frames >> 48) & 0xffff};
+
     for (int i=0; i<16; i++)
     {
+        uint32_t slab = (config->rx_slabs >> (i*4)) & 0xf;
+
         test->slot_test_rx[i].i2s = NULL;
 
         if ((config->rx_slots >> i) & 1)
@@ -567,14 +823,25 @@ int i2s_test_init(i2s_test_t *test, i2s_test_config_t *config)
             int slot_format = (config->rx_slots_format >> (i*4)) & 0xF;
             int random_mute = (config->random_mute >> i) & 1;
             int ts_evt_en = (config->ts_evt >> i) & 1;
+            uint32_t frame = 0;
+
+            for (int j=0; j<4; j++)
+            {
+                if (((rx_frames[j] >> i) & 1) != 0)
+                {
+                    frame = rx_frames[j];
+                }
+            }
 
             i2s_slot_config_t i2s_slot_config = {
                 .itf=config->itf, .slot=i, .is_rx=1, .word_size=slot_width, .nb_elem=config->buffer_nb_elem, .elem_size=config->elem_size, .format=slot_format,
-                .mute_delay_start=30, .mute_delay_incr=20, .mute_delay_end=150, .random_mute=random_mute, .ts_evt_en=ts_evt_en
+                .mute_delay_start=30, .mute_delay_incr=20, .mute_delay_end=150, .random_mute=random_mute, .ts_evt_en=ts_evt_en, .frame=frame, .slab=slab
 
             };
 
-            if (i2s_slot_init(&test->slot_test_rx[i], &test->i2s, &i2s_slot_config))
+            i2s_slot_new(&test->slot_test_rx[i]);
+
+            if (i2s_slot_init(test, &test->slot_test_rx[i], &test->i2s, &i2s_slot_config))
                 return -1;
     
             int iter = config->nb_slots;
@@ -600,6 +867,8 @@ int i2s_test_init(i2s_test_t *test, i2s_test_config_t *config)
 
     for (int i=0; i<16; i++)
     {
+        uint32_t slab = (config->tx_slabs >> (i*4)) & 0xf;
+
         test->slot_test_tx[i].i2s = NULL;
 
         if ((config->tx_slots >> i) & 1)
@@ -609,15 +878,32 @@ int i2s_test_init(i2s_test_t *test, i2s_test_config_t *config)
             int random_mute = (config->random_mute >> i) & 1;
             int ts_evt_en = (config->ts_evt >> (16+i)) & 1;
             int bypass = (config->tx_slots_bypass >> i) & 1;
+            uint32_t frame = 0;
+
+            for (int j=0; j<4; j++)
+            {
+                if (((tx_frames[j] >> i) & 1) != 0)
+                {
+                    frame = tx_frames[j];
+                }
+            }
 
             i2s_slot_config_t i2s_slot_config = {
                 .itf=config->itf, .slot=i, .is_rx=0, .word_size=slot_width, .nb_elem=config->buffer_nb_elem, .elem_size=config->elem_size, .format=slot_format,
-                .mute_delay_start=30, .mute_delay_incr=20, .mute_delay_end=150, .random_mute=random_mute, .ts_evt_en=ts_evt_en, .bypass=bypass
+                .mute_delay_start=30, .mute_delay_incr=20, .mute_delay_end=150, .random_mute=random_mute, .ts_evt_en=ts_evt_en, .bypass=bypass, .frame=frame, .slab=slab
             };
 
-            if (i2s_slot_init(&test->slot_test_tx[i], &test->i2s, &i2s_slot_config))
+            i2s_slot_new(&test->slot_test_tx[i]);
+
+            if (i2s_slot_init(test, &test->slot_test_tx[i], &test->i2s, &i2s_slot_config))
                 return -1;
-    
+        }
+    }
+
+    for (int i=0; i<16; i++)
+    {
+        if ((config->tx_slots >> i) & 1)
+        {
             int iter = config->nb_slots;
             if (config->full_duplex)
             {
diff --git a/libs/gap_lib/testbench/testlib.h b/libs/gap_lib/testbench/testlib.h
index 982e59785..1526893cc 100644
--- a/libs/gap_lib/testbench/testlib.h
+++ b/libs/gap_lib/testbench/testlib.h
@@ -16,6 +16,8 @@
 
 #define I2S_SLOT_STATIC_INIT {0}
 
+typedef struct i2s_test_s i2s_test_t;
+
 typedef struct
 {
     int itf;
@@ -60,6 +62,8 @@ typedef struct
     int ts_evt_en;
     int slot_disable;
     int bypass;
+    uint32_t frame;
+    uint32_t slab;
     union
     {
         struct
@@ -132,6 +136,12 @@ typedef struct
     int bypass;
     int mute_ack;
     void *buffers[2];
+    uint32_t frame;
+    i2s_test_t *test;
+    pi_mem_slab_t slab;
+    struct {
+        int use_slab:1;
+    } flags;
 }
 i2s_slot_test_t;
 
@@ -171,11 +181,15 @@ typedef struct
     uint32_t fifo_id;
     int8_t ws_delay;
     uint32_t incr;
+    uint64_t rx_frames;
+    uint64_t tx_frames;
+    uint64_t rx_slabs;
+    uint64_t tx_slabs;
 }
 i2s_test_config_t;
 
 
-typedef struct 
+typedef struct i2s_test_s
 {
     struct pi_device i2s;
     i2s_slot_test_t slot_test_rx[16];
@@ -194,7 +208,8 @@ pi_device_t *get_testbench();
 pi_device_t *i2s_init(struct pi_device *i2s, i2s_config_t *config);
 int i2s_deinit(struct pi_device *i2s, int itf);
 
-int i2s_slot_init(i2s_slot_test_t *i2s_slot, struct pi_device *i2s, i2s_slot_config_t *config);
+void i2s_slot_new(i2s_slot_test_t *i2s_slot);
+int i2s_slot_init(i2s_test_t *test, i2s_slot_test_t *i2s_slot, struct pi_device *i2s, i2s_slot_config_t *config);
 void i2s_slot_deinit(i2s_slot_test_t *i2s_slot);
 int i2s_slot_start(i2s_slot_test_t *i2s_slot, i2s_slot_start_config_t *config);
 int i2s_slot_stop(i2s_slot_test_t *i2s_slot);
diff --git a/rtos/freeRTOS/vendors/gwt/gap8/pmsis/drivers/udma/i2s/i2s_internal.c b/rtos/freeRTOS/vendors/gwt/gap8/pmsis/drivers/udma/i2s/i2s_internal.c
index cba50c71b..3c1b9bd9a 100644
--- a/rtos/freeRTOS/vendors/gwt/gap8/pmsis/drivers/udma/i2s/i2s_internal.c
+++ b/rtos/freeRTOS/vendors/gwt/gap8/pmsis/drivers/udma/i2s/i2s_internal.c
@@ -162,7 +162,9 @@ static uint8_t __pi_i2s_enqueue(struct i2s_itf_data_s *itf_data)
             itf_data->pending_size = 0;
         }
         /* Enqueue first in HW fifo. */
-        hal_i2s_enqueue(device_id, itf_data->i2s_id, buffer, size_enqueue, itf_data->udma_cfg);
+        I2S_TRACE("i2s(%d): enqueue l2_buf=%lx, size=%ld, cfg=%lx\n",
+               itf_data->i2s_id, buffer, size_enqueue, itf_data->udma_cfg);
+        hal_i2s_enqueue(device_id, itf_data->i2s_id << 4, buffer, size_enqueue, itf_data->udma_cfg);
     }
     return done;
 }
@@ -238,7 +240,7 @@ static void __pi_i2s_clock_enable(struct i2s_itf_data_s *itf_data)
     if ((itf_data->clk & PI_I2S_OPT_EXT_CLK) == 0)
     {
         uint16_t clk_div = __pi_i2s_clk_div(itf_data->frequency);
-        I2S_TRACE("I2S(%d) clk_div = %d\n", device_id, clk_div);
+        I2S_TRACE("I2S(%ld) clk_div = %d\n", device_id, clk_div);
         switch (itf_data->clk & 0x1)
         {
         case 0 :
diff --git a/rtos/freeRTOS/vendors/gwt/libs/include/string.h b/rtos/freeRTOS/vendors/gwt/libs/include/string.h
index 025caa471..51e9afeec 100644
--- a/rtos/freeRTOS/vendors/gwt/libs/include/string.h
+++ b/rtos/freeRTOS/vendors/gwt/libs/include/string.h
@@ -34,6 +34,10 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int32_t memcmp(const void *str1, const void *str2, size_t n);
 
 void *memcpy(void *str1, const void *str2, size_t n);
@@ -60,4 +64,8 @@ size_t strcspn(const char *s, const char *reject);
 
 char *strchr(const char *s, int c);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif  /* __STRING_H__ */
diff --git a/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis.h b/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis.h
index 1ec6ca35b..e554d5e2d 100644
--- a/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis.h
+++ b/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis.h
@@ -72,4 +72,9 @@
 /* Hal includes. */
 #include "pmsis/implem/hal/hal.h"
 
+/* GVSOC proxy includes. */
+#if defined(__PLATFORM_GVSOC__)
+#include "pmsis/platforms/gvsoc.h"
+#endif  /* __PLATFORM_GVSOC__ */
+
 #endif  /* __PMSIS_H__ */
diff --git a/rtos/freeRTOS/vendors/gwt/pmsis/rtos/include/pmsis/rtos/os/os.h b/rtos/freeRTOS/vendors/gwt/pmsis/rtos/include/pmsis/rtos/os/os.h
index b7ae00392..7a0fdae6a 100644
--- a/rtos/freeRTOS/vendors/gwt/pmsis/rtos/include/pmsis/rtos/os/os.h
+++ b/rtos/freeRTOS/vendors/gwt/pmsis/rtos/include/pmsis/rtos/os/os.h
@@ -35,6 +35,10 @@
 #include "pmsis/backend/implementation_specific_defines.h"
 #include "pmsis/backend/pmsis_backend_native_task_api.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*******************************************************************************
  * Definitions
  ******************************************************************************/
@@ -290,4 +294,8 @@ static inline void pi_os_reboot(void)
 
 #endif  /* __GAP8__ */
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif  /* __PI_RTOS_IMPLEM_OS_H__ */
diff --git a/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk b/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk
index 96a66c543..9a2dfe3a0 100644
--- a/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk
+++ b/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk
@@ -174,7 +174,7 @@ FREERTOS_FLAGS     += -DMAIN_APP_STACK_SIZE=$(MAIN_APP_STACK_SIZE)
 GCC_OPTIM_LEVEL     = -Os	# Optimize for size.
 
 COMMON              = $(RISCV_FLAGS) \
-                      -c -g -ffunction-sections -fdata-sections \
+                      -c -gdwarf-2 -gstrict-dwarf -ffunction-sections -fdata-sections \
                       -fno-delete-null-pointer-checks -fomit-frame-pointer \
                       -fno-tree-loop-distribute-patterns -fno-jump-tables \
                       $(FEATURE_FLAGS) $(FREERTOS_FLAGS)
@@ -345,6 +345,11 @@ OBJS_DEP            = $(patsubst %.o, %.d, $(OBJS))
 APP                ?= test
 BIN                 = $(BUILDDIR)/$(APP)
 
+
+ifneq ($(wsl),)
+WSL_ENV=--wsl=$(wsl)
+endif
+
 # Makefile targets :
 # Build objects (*.o) amd associated dependecies (*.d) with disassembly (*.dump).
 #------------------------------------------
@@ -410,7 +415,7 @@ image: $(BIN)
 	gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --image --binary=$(BIN) $(runner_args)
 
 run: $(BIN)
-	gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --exec-prepare --exec --binary=$(BIN) $(runner_args)
+	gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --exec-prepare --exec --binary=$(BIN) $(runner_args) $(WSL_ENV)
 
 traces:
 	gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --exec --binary=$(BIN) --no-run --extend-traces $(runner_args)
diff --git a/rtos/pmsis/pmsis_api/include/pmsis/drivers/i2s.h b/rtos/pmsis/pmsis_api/include/pmsis/drivers/i2s.h
index d043479f6..62c560de7 100644
--- a/rtos/pmsis/pmsis_api/include/pmsis/drivers/i2s.h
+++ b/rtos/pmsis/pmsis_api/include/pmsis/drivers/i2s.h
@@ -19,6 +19,10 @@
 
 #include <stdint.h>
 
+#ifndef PI_INLINE_I2S_LVL_0
+#define PI_INLINE_I2S_LVL_0
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -756,9 +760,30 @@ void pi_i2s_channel_conf_init(struct pi_i2s_channel_conf *conf);
  * @retval 0 If successful.
  * @retval -1 An error occured.
  */
-int pi_i2s_channel_conf_set(struct pi_device *dev, int channel,
+PI_INLINE_I2S_LVL_0 int pi_i2s_channel_conf_set(struct pi_device *dev, int channel,
     struct pi_i2s_channel_conf *conf);
 
+/**
+ * \brief Configure a channel of a frame in TDM mode.
+ *
+ * A frame is a set of channels gathered together so that their data
+ * transfers can be controlled as a single unit.
+ * In TDM mode, the same interface is time-multiplexed to transmit data
+ * for multiple channels, and each channel can have a specific
+ * configuration. This function gives the configuration of one channel
+ * which is part of a frame.
+ * @param dev Pointer to the device structure for the driver instance.
+ * @param frame A bitfield containing the channels of the frame (one bit per channel).
+ * @param channel ID of the slot, from 0 to the number of channels minus 1.
+ * @param conf A pointer to the I2S channel configuration.
+ *
+ * @retval 0 If successful.
+ * @retval -1 An error occurred.
+ */
+PI_INLINE_I2S_LVL_0 int pi_i2s_frame_channel_conf_set(struct pi_device *dev,
+    uint32_t frame, int channel, struct pi_i2s_channel_conf *conf);
+
 
 /**
  * \brief Get the current configuration of a channel in TDM mode.
@@ -899,8 +924,8 @@ int pi_i2s_slots_stop(struct pi_device *dev, uint32_t slots);
  *
  * @retval 0 If successful, -1 if not.
  */
-int pi_i2s_channel_read(struct pi_device *dev, int channel, void **mem_block,
-    size_t *size);
+int pi_i2s_channel_read(struct pi_device *dev, int channel,
+    void **mem_block, size_t *size);
 
 /**
  * @brief Read data asynchronously from the RX queue of a channel in TDM mode.
@@ -929,8 +954,77 @@ int pi_i2s_channel_read(struct pi_device *dev, int channel, void **mem_block,
  *
  * @retval 0 If successful, -1 if not.
  */
-int pi_i2s_channel_read_async(struct pi_device *dev, int channel,
-    pi_task_t *task);
+int pi_i2s_channel_read_async(struct pi_device *dev,
+    int channel, pi_task_t *task);
+
+/**
+ * @brief Read data from the RX queue of a frame channel in TDM mode.
+ *
+ * Data received by the I2S interface is stored in the RX queue consisting
+ * of two memory blocks preallocated by the user and given to the driver in
+ * the configuration. Calling this function will return the next available
+ * buffer to the caller, which has to use it before the sampling for this buffer
+ * starts again.
+ *
+ * The data is read in chunks equal to the size of the memory block.
+ *
+ * This returns data for the specified frame and must only be used in
+ * TDM mode.
+ *
+ * If there is no data in the RX queue the function will block waiting for
+ * the next RX memory block to fill in.
+ *
+ * This function reads the whole frame: the returned buffer contains the
+ * samples of all its channels, but the reported size is that of one channel.
+ *
+ * Due to hardware constraints, the address of the buffer must be aligned on
+ * 4 bytes.
+ *
+ * @param dev Pointer to the device structure for the driver instance.
+ * @param frame A bitfield containing the channels of the frame (one bit per channel).
+ * @param mem_block Pointer to the variable storing the address of the RX memory
+ *   block containing received data.
+ * @param size Pointer to the variable storing the number of bytes read.
+ *
+ * @retval 0 If successful, -1 if not.
+ */
+PI_INLINE_I2S_LVL_0 int pi_i2s_frame_read(struct pi_device *dev, uint32_t frame,
+    void **mem_block, size_t *size);
+
+/**
+ * @brief Read data asynchronously from the RX queue of a frame channel in TDM mode.
+ *
+ * Data received by the I2S interface is stored in the RX queue consisting
+ * of two memory blocks preallocated by the user and given to the driver in
+ * the configuration. Calling this function will return the next available
+ * buffer to the caller, which has to use it before the sampling for this buffer
+ * starts again.
+ *
+ * The data is read in chunks equal to the size of the memory block.
+ *
+ * This returns data for the specified frame and must only be used in
+ * TDM mode.
+ *
+ * The specified task will be pushed as soon as data is ready in the RX queue,
+ * and the information about the memory block and the size will be available
+ * in the task.
+ *
+ * This function reads the whole frame: the returned buffer contains the
+ * samples of all its channels, but the reported size is that of one channel.
+ *
+ * Due to hardware constraints, the address of the buffer must be aligned on
+ * 4 bytes.
+ *
+ * @param dev Pointer to the device structure for the driver instance.
+ * @param frame A bitfield containing the channels of the frame (one bit per channel).
+ * @param task The task used to notify the end of transfer.
+ *
+ * @retval 0 If successful, -1 if not.
+ */
+PI_INLINE_I2S_LVL_0 int pi_i2s_frame_read_async(struct pi_device *dev,
+    uint32_t frame, pi_task_t *task);
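+
+/*
+ * Minimal usage sketch for the asynchronous frame read (the `i2s_ctx_t`
+ * user context, its fields and the end_of_frame() callback are
+ * illustrative only, not part of this API):
+ *
+ *   static void end_of_frame(void *arg)
+ *   {
+ *       i2s_ctx_t *ctx = (i2s_ctx_t *)arg;  // hypothetical user context
+ *       void *chunk;
+ *       size_t size;
+ *       if (pi_i2s_read_status(&ctx->task, &chunk, &size) == 0)
+ *       {
+ *           // chunk: samples of all channels of the frame,
+ *           // size: the size of one channel
+ *           pi_i2s_frame_read_async(&ctx->i2s, ctx->frame,
+ *               pi_task_callback(&ctx->task, end_of_frame, ctx));
+ *       }
+ *   }
+ */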
 
 /**
  * @brief Read the status of an asynchronous read.
@@ -979,8 +1073,8 @@ int pi_i2s_read_status(pi_task_t *task, void **mem_block, size_t *size);
  * @retval 0 If successful.
  * @retval -1 An error occured.
  */
-int pi_i2s_channel_write(struct pi_device *dev, int channel, void *mem_block,
-    size_t size);
+int pi_i2s_channel_write(struct pi_device *dev, int channel,
+    void *mem_block, size_t size);
 
 /**
  * @brief Write data asynchronously to the TX queue of a channel in TDM mode.
@@ -1013,8 +1107,83 @@ int pi_i2s_channel_write(struct pi_device *dev, int channel, void *mem_block,
  * @retval 0 If successful.
  * @retval -1 An error occured.
  */
-int pi_i2s_channel_write_async(struct pi_device *dev, int channel,
-    void *mem_block, size_t size, pi_task_t *task);
+int pi_i2s_channel_write_async(struct pi_device *dev,
+    int channel, void *mem_block, size_t size, pi_task_t *task);
+
+/**
+ * @brief Write data to the TX queue of a frame channel in TDM mode.
+ *
+ * Data to be sent by the I2S interface is stored first in the TX queue
+ * consisting of memory blocks preallocated by the user with either pingpong
+ * buffers or a memory slab allocator.
+ *
+ * In pingpong mode, the driver will automatically alternate between 2 buffers
+ * and the user code is supposed to call this function to notify the driver
+ * that the specified buffer is ready to be sent. This is used by the driver
+ * to report when an underrun or an overrun occurs.
+ *
+ * In memory slab allocator mode, the user has to allocate buffers from the
+ * memory slab allocator and pass them to the driver by calling this function
+ * when they are ready to be sent.
+ *
+ * This function will block until the specified buffer has been transferred.
+ *
+ * This function writes the whole frame: the buffer must contain the
+ * samples of all its channels, but the size is that of one channel.
+ *
+ * This sends data to the specified frame and must only be used in
+ * TDM mode.
+ *
+ * @param dev Pointer to the device structure for the driver instance.
+ * @param frame A bitfield containing the channels of the frame (one bit per channel).
+ * @param mem_block Pointer to the TX memory block containing data to be sent.
+ * @param size Number of bytes to write. This value has to be equal or smaller
+ *        than the size of the memory block.
+ *
+ * @retval 0 If successful.
+ * @retval -1 An error occurred.
+ */
+PI_INLINE_I2S_LVL_0 int pi_i2s_frame_write(struct pi_device *dev, uint32_t frame,
+    void *mem_block, size_t size);
+
+/**
+ * @brief Write data asynchronously to the TX queue of a frame channel in TDM mode.
+ *
+ * Data to be sent by the I2S interface is stored first in the TX queue
+ * consisting of memory blocks preallocated by the user with either pingpong
+ * buffers or a memory slab allocator.
+ *
+ * In pingpong mode, the driver will automatically alternate between 2 buffers
+ * and the user code is supposed to call this function to notify the driver
+ * that the specified buffer is ready to be sent. This is used by the driver
+ * to report when an underrun or an overrun occurs.
+ *
+ * In memory slab allocator mode, the user has to allocate buffers from the
+ * memory slab allocator and pass them to the driver by calling this function
+ * when they are ready to be sent.
+ *
+ * This sends data to the specified frame and must only be used in
+ * TDM mode.
+ *
+ * This function writes the whole frame: the buffer must contain the
+ * samples of all its channels, but the size is that of one channel.
+ *
+ * The specified task will be pushed as soon as the data has been transferred.
+ *
+ * @param dev Pointer to the device structure for the driver instance.
+ * @param frame A bitfield containing the channels of the frame (one bit per channel).
+ * @param mem_block Pointer to the TX memory block containing data to be sent.
+ * @param size Number of bytes to write. This value has to be equal or smaller
+ *        than the size of the memory block.
+ * @param task The task used to notify the end of transfer.
+ *
+ * @retval 0 If successful.
+ * @retval -1 An error occurred.
+ */
+PI_INLINE_I2S_LVL_0 int pi_i2s_frame_write_async(struct pi_device *dev,
+    uint32_t frame, void *mem_block, size_t size, pi_task_t *task);
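+
+/*
+ * Minimal usage sketch (the `i2s` device, `frame`, `buf` and tx_done()
+ * are illustrative, not part of this API): `buf` carries block_size
+ * bytes per channel, channels laid out back to back in slot order.
+ *
+ *   pi_i2s_frame_write_async(&i2s, frame, buf, block_size,
+ *       pi_task_callback(&task, tx_done, NULL));
+ */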
 
 /**
  * @brief Read the status of an asynchronous write.
diff --git a/rtos/pulp/pulpos-2/include/pmsis.h b/rtos/pulp/pulpos-2/include/pmsis.h
index 830a86669..d8441d061 100644
--- a/rtos/pulp/pulpos-2/include/pmsis.h
+++ b/rtos/pulp/pulpos-2/include/pmsis.h
@@ -35,6 +35,7 @@
 #define PI_INLINE_CL_TEAM_1 static inline
 #define PI_INLINE_HYPER_LVL_0 static inline
 #define PI_INLINE_OCTOSPI_LVL_0 static inline
+#define PI_INLINE_I2S_LVL_0 static inline
 
 #if defined(__GAP9__)
 #include "pmsis/chips/gap8/perf.h"
@@ -57,6 +58,7 @@
 #include "pmsis/rtos/malloc/cl_l1_malloc.h"
 #include "pmsis/rtos/malloc/l2_malloc.h"
 #include "pmsis/rtos/malloc/fc_l1_malloc.h"
+#include "pmsis/mem_slab.h"
 #include "pmsis/drivers/perf.h"
 #include "pmsis/drivers/hyperbus.h"
 #include "pmsis/drivers/octospi.h"
diff --git a/rtos/pulp/pulpos-2/include/pos/data/data.h b/rtos/pulp/pulpos-2/include/pos/data/data.h
index 9dc3acf5a..0250d1701 100644
--- a/rtos/pulp/pulpos-2/include/pos/data/data.h
+++ b/rtos/pulp/pulpos-2/include/pos/data/data.h
@@ -132,6 +132,15 @@ typedef struct pi_cluster_pe_task_s
     uint8_t nb_done_cores;
 } pi_cluster_pe_task_t;
 
+
+struct pi_mem_slab {
+    uint32_t num_blocks;
+    size_t block_size;
+    char *buffer;
+    char *free_list;
+    uint32_t num_used;
+};
+
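+/*
+ * Usage sketch, with illustrative sizes: the slab is carved out of a
+ * caller-provided buffer, then blocks are taken and returned with
+ * pi_mem_slab_alloc()/pi_mem_slab_free() (see pmsis/mem_slab.h):
+ *
+ *   static pi_mem_slab_t slab;
+ *   void *buf = pi_l2_malloc(256 * 4);
+ *   pi_mem_slab_init(&slab, buf, 256, 4);  // 4 blocks of 256 bytes
+ *
+ *   void *block;
+ *   pi_mem_slab_alloc(&slab, &block, 0);   // last arg as used in testlib.c
+ *   pi_mem_slab_free(&slab, &block);
+ */
+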
 #endif
 
 
diff --git a/rtos/pulp/pulpos-2/include/pos/implem/implem.h b/rtos/pulp/pulpos-2/include/pos/implem/implem.h
index 5484ca05d..ab0701ede 100644
--- a/rtos/pulp/pulpos-2/include/pos/implem/implem.h
+++ b/rtos/pulp/pulpos-2/include/pos/implem/implem.h
@@ -141,6 +141,7 @@ static inline void pmsis_exit(int err)
 #if defined(__GAP9__)
 #include "pos/implem/hyperbus-v2.h"
 #include "pos/implem/octospi-v2.h"
+#include "pos/implem/i2s-v3.h"
 #endif
 #endif
 
diff --git a/rtos/pulp/pulpos-2/rules/pulpos/src.mk b/rtos/pulp/pulpos-2/rules/pulpos/src.mk
index 09d748f17..3726fe2f0 100644
--- a/rtos/pulp/pulpos-2/rules/pulpos/src.mk
+++ b/rtos/pulp/pulpos-2/rules/pulpos/src.mk
@@ -20,7 +20,7 @@ endif
 
 ifdef CONFIG_KERNEL
 PULP_SRCS += kernel/init.c kernel/kernel.c kernel/device.c kernel/task.c kernel/alloc.c \
-	kernel/alloc_pool.c kernel/irq.c kernel/soc_event.c kernel/log.c kernel/time.c
+	kernel/alloc_pool.c kernel/irq.c kernel/soc_event.c kernel/log.c kernel/time.c kernel/mem_slab.c
 
 PULP_ASM_SRCS += kernel/irq_asm.S kernel/time_asm.S
 
diff --git a/testplan.cfg b/testplan.cfg
index e65b66319..181e6384c 100644
--- a/testplan.cfg
+++ b/testplan.cfg
@@ -24,6 +24,8 @@ i2s.add_feature('Slot dynamic reconfiguration')
 i2s.add_feature('Cross-interface synchronization')
 i2s.add_feature('PDM SFU connection')
 i2s.add_feature('UDMA core PCM connection')
+i2s.add_feature('Memory slab allocators')
+i2s.add_feature('Channel frames')
 
 sfu = testplan.add_category('udma:sfu')
 sfu.add_feature('MEM_IN - Memory input')
diff --git a/tools/autotiler_v3/CNN_Generators/CNN_Copy_Generators.c b/tools/autotiler_v3/CNN_Generators/CNN_Copy_Generators.c
index 1af0778c6..a6030494d 100644
--- a/tools/autotiler_v3/CNN_Generators/CNN_Copy_Generators.c
+++ b/tools/autotiler_v3/CNN_Generators/CNN_Copy_Generators.c
@@ -336,6 +336,78 @@ void LoadCNN_Copy_Library()
                         TCArg("signed char *__restrict__", "Infos")
                         )
         );
+        LibKernelTemplate("CNN_Float32Fp_T",
+                        CArgs(5,
+                        TCArg("float *__restrict__", "In"),
+                        TCArg("signed short *__restrict__", "Out"),
+                        TCArg("unsigned short int", "W"),
+                        TCArg("unsigned short int", "H"),
+                        TCArg("signed char *__restrict__", "Infos")
+                        )
+        );
+        LibKernelTemplate("CNN_Float32UFp_T",
+                        CArgs(5,
+                        TCArg("float *__restrict__", "In"),
+                        TCArg("unsigned short *__restrict__", "Out"),
+                        TCArg("unsigned short int", "W"),
+                        TCArg("unsigned short int", "H"),
+                        TCArg("signed char *__restrict__", "Infos")
+                        )
+        );
+        LibKernelTemplate("CNN_Float32Fps_T",
+                        CArgs(5,
+                        TCArg("float *__restrict__", "In"),
+                        TCArg("signed char *__restrict__", "Out"),
+                        TCArg("unsigned short int", "W"),
+                        TCArg("unsigned short int", "H"),
+                        TCArg("signed char *__restrict__", "Infos")
+                        )
+        );
+        LibKernelTemplate("CNN_Float32UFps_T",
+                        CArgs(5,
+                        TCArg("float *__restrict__", "In"),
+                        TCArg("unsigned char *__restrict__", "Out"),
+                        TCArg("unsigned short int", "W"),
+                        TCArg("unsigned short int", "H"),
+                        TCArg("signed char *__restrict__", "Infos")
+                        )
+        );
+        LibKernelTemplate("CNN_FpFloat32_T",
+                        CArgs(5,
+                        TCArg("signed short *__restrict__", "In"),
+                        TCArg("float *__restrict__", "Out"),
+                        TCArg("unsigned short int", "W"),
+                        TCArg("unsigned short int", "H"),
+                        TCArg("signed char *__restrict__", "Infos")
+                        )
+        );
+        LibKernelTemplate("CNN_UFpFloat32_T",
+                        CArgs(5,
+                        TCArg("unsigned short *__restrict__", "In"),
+                        TCArg("float *__restrict__", "Out"),
+                        TCArg("unsigned short int", "W"),
+                        TCArg("unsigned short int", "H"),
+                        TCArg("signed char *__restrict__", "Infos")
+                        )
+        );
+        LibKernelTemplate("CNN_FpsFloat32_T",
+                        CArgs(5,
+                        TCArg("signed char *__restrict__", "In"),
+                        TCArg("float *__restrict__", "Out"),
+                        TCArg("unsigned short int", "W"),
+                        TCArg("unsigned short int", "H"),
+                        TCArg("signed char *__restrict__", "Infos")
+                        )
+        );
+        LibKernelTemplate("CNN_UFpsFloat32_T",
+                        CArgs(5,
+                        TCArg("unsigned char *__restrict__", "In"),
+                        TCArg("float *__restrict__", "Out"),
+                        TCArg("unsigned short int", "W"),
+                        TCArg("unsigned short int", "H"),
+                        TCArg("signed char *__restrict__", "Infos")
+                        )
+        );
 	LibKernelTemplate("KerMatTranspose_fps_T",
 		CArgs(7,
 			TCArg("signed char *__restrict__", "In"),
@@ -506,6 +578,16 @@ void LoadCNN_Copy_Library()
 	LibKernel("CNN_FpsFloat16", CALL_PARALLEL, 0, "CNN_FpsFloat16_T",	CNN_Match(CNN_OperList(1, KOP_CONVERT_FP_FL),                  0, 1, CNN_Type(1,0,0,0,2), 0,0,0,0,1,1));
 	LibKernel("CNN_UFpsFloat16", CALL_PARALLEL, 0, "CNN_UFpsFloat16_T",	CNN_Match(CNN_OperList(1, KOP_CONVERT_FP_FL),                  0, 1, CNN_Type(-1,0,0,0,2), 0,0,0,0,1,1));
 
+        LibKernel("CNN_Float32Fp", CALL_PARALLEL, 0, "CNN_Float32Fp_T",         CNN_Match(CNN_OperList(1, KOP_CONVERT_FL_FP),                  0, 1, CNN_Type(4,0,0,0,2), 0,0,0,0,1,1));
+        LibKernel("CNN_Float32UFp", CALL_PARALLEL, 0, "CNN_Float32UFp_T",       CNN_Match(CNN_OperList(1, KOP_CONVERT_FL_FP),                  0, 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,1,1));
+        LibKernel("CNN_Float32Fps", CALL_PARALLEL, 0, "CNN_Float32Fps_T",       CNN_Match(CNN_OperList(1, KOP_CONVERT_FL_FP),                  0, 1, CNN_Type(4,0,0,0,1), 0,0,0,0,1,1));
+        LibKernel("CNN_Float32UFps", CALL_PARALLEL, 0, "CNN_Float32UFps_T",     CNN_Match(CNN_OperList(1, KOP_CONVERT_FL_FP),                  0, 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,1,1));
+
+        LibKernel("CNN_FpFloat32", CALL_PARALLEL, 0, "CNN_FpFloat32_T",         CNN_Match(CNN_OperList(1, KOP_CONVERT_FP_FL),                  0, 1, CNN_Type(2,0,0,0,4), 0,0,0,0,1,1));
+        LibKernel("CNN_UFpFloat32", CALL_PARALLEL, 0, "CNN_UFpFloat32_T",       CNN_Match(CNN_OperList(1, KOP_CONVERT_FP_FL),                  0, 1, CNN_Type(-2,0,0,0,4), 0,0,0,0,1,1));
+        LibKernel("CNN_FpsFloat32", CALL_PARALLEL, 0, "CNN_FpsFloat32_T",       CNN_Match(CNN_OperList(1, KOP_CONVERT_FP_FL),                  0, 1, CNN_Type(1,0,0,0,4), 0,0,0,0,1,1));
+        LibKernel("CNN_UFpsFloat32", CALL_PARALLEL, 0, "CNN_UFpsFloat32_T",     CNN_Match(CNN_OperList(1, KOP_CONVERT_FP_FL),                  0, 1, CNN_Type(-1,0,0,0,4), 0,0,0,0,1,1));
+
 	/****************************************************************************************************************/
 	/* Kernels tensor/matrix transposes and permutations  */
 	/****************************************************************************************************************/
@@ -1200,7 +1282,7 @@ int CNN_Convert(
 	PKerArgs[0] = KerArg("In",   KerArgSpace(1,D0), O_IN|O_DB,            1, 1, In_DataSize,  0, 0, 0, "In");
 	PKerArgs[1] = KerArg("Out",  KerArgSpace(1,D0), O_OUT|O_DB,           1, 1, Out_DataSize,  0, 0, 0, "Out");
         if (kop!=KOP_CONVERT_FP_FP)
-        	PKerArgs[2] = KerArg("Infos",KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED, 1, 1, 7,  0, 0, 0, "Infos");
+        	PKerArgs[2] = KerArg("Infos",KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED, 1, 1, 8,  0, 0, 0, "Infos");
 	Kernel_T *Kernel = UserKernel(Name,
 				KernelIterSpace(2, IterTiledSpace(T0), IterParSpace(D0, Sz, 8)),
 				TILE_HOR,
@@ -1225,10 +1307,10 @@ int CNN_Convert(
 	if (Kernel) {
 		AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0);
 		AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0);
-		AddKernelArgDim(Name, "In",    3, 1, Sz, In_DataSize);
-		AddKernelArgDim(Name, "Out",   3, 1, Sz, Out_DataSize);
+		In_Float ? AddKernelFloatArgDim(Name, "In",  3, 1, Sz, In_DataSize)  : AddKernelArgDim(Name, "In",  3, 1, Sz, In_DataSize);
+		Out_Float? AddKernelFloatArgDim(Name, "Out", 3, 1, Sz, Out_DataSize) : AddKernelArgDim(Name, "Out", 3, 1, Sz, Out_DataSize);
                 if (kop!=KOP_CONVERT_FP_FP)
-	        	AddKernelArgDim(Name, "Infos", 2, 1, 1);
+	        	AddKernelArgDim(Name, "Infos", 2, 8, 1);
 	}
 	return (Kernel!=0);
 }
diff --git a/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c b/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c
index bc010a175..f4a50abbf 100644
--- a/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c
+++ b/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c
@@ -134,7 +134,7 @@ void LoadCNN_NE16_SQ8_Library()
                         TCArg("unsigned char",                "Dy")
                         )
         );
-        LibKernelTemplate("Ker_MM_Conv_NE16_T",
+        LibKernelTemplate("KerConv_MM_NE16_T",
                   CArgs(28,
                         TCArg("unsigned char * __restrict__", "In"),
                         TCArg("unsigned char * __restrict__", "ColBuff"),
@@ -239,13 +239,13 @@ void LoadCNN_NE16_SQ8_Library()
                 CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, -1, CNN_Type(0,0,0,0,4), 3,3,1,1,1,1));
         LibKernel("KerConvDW3x3Stride2_NE16",   CALL_SEQUENTIAL_STRUCT|CALL_NE16_KER, 0, "KerConv_NE16_T",
                 CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, -1, CNN_Type(0,0,0,0,4), 3,3,1,1,2,2));
+        LibKernel("Ker_MM_Conv2D_NE16",     CALL_PARALLEL_CC|CALL_NE16_KER, 0, "KerConv_MM_NE16_T",
+                CNN_Match(CNN_OperList(1, KOP_MM_CONV), 0, -1, CNN_Type(0,0,0,0,4), -1,-1, 1, 1,-1,-1));
 
         LibKernel("KerLinear_8a_NE16",          CALL_SEQUENTIAL_STRUCT|CALL_NE16_KER, 0, "KerLinear_NE16_T",
                 CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), -1, CNN_Type(1,0,4,0,0), 0,0,0,0,0,0));
         LibKernel("KerLinear_16a_NE16",         CALL_SEQUENTIAL_STRUCT|CALL_NE16_KER, 0, "KerLinear_NE16_T",
                 CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), -1, CNN_Type(2,0,4,0,0), 0,0,0,0,0,0));
-        LibKernel("Ker_MM_Conv2D_NE16",         CALL_PARALLEL_CC|CALL_NE16_KER,       0, "Ker_MM_Conv_NE16_T",
-                CNN_Match(CNN_OperList(1, KOP_MM_CONV), 0, -1, CNN_Type(1,1,0,0,4), -1,-1,1,1,-1,-1));
 
         LibKernel("KerMatMul_8a_NE16",          CALL_PARALLEL_CC|CALL_NE16_KER, 0, "KerMatMul_NE16_T",
                 CNN_Match(CNN_OperList(1, KOP_MATMUL_TRANSPOSED), CNN_OperList(1, KOP_NONE), 0, CNN_Type(1,0,4,0,0), 0,0,0,0,0,0));
@@ -259,7 +259,7 @@ void LoadCNN_NE16_SQ8_Library()
 }
 
 
-int CNN_MM_ConvolutionNE16(
+Kernel_T *CNN_MM_ConvolutionNE16(
         char         *Name,
 
         CNN_GenControl_T *Ctrl,
@@ -335,7 +335,7 @@ int CNN_MM_ConvolutionNE16(
         if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_SIGMOID || ActOper == KOP_TANH))
                 GenTilingError("CNN_MM_ConvolutionNE16 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_HSIGMOID, KOP_HSWISH, KOP_LEAKYRELU, KOP_SIGMOID or KOP_TANH", Name);
 
-        Wa |= O_NE16_RNN;
+        Wa |= O_NE16_LIN | O_LINEAR;
         /* When there is a special activation (not supported by the accelerator itself), you need to streamout 32bits and do the act in the cluster but the ((*S) >> N) is done in the accelerator (KOP_DP_REDUCT_NOSCALE) */
         int NeedReductNoScale = !(ActOper == KOP_RELU || ActOper == KOP_NONE);
         /* Also when in/out are 16bits you need to streamout 32bits but here the reduction step will be done in the cluster (KOP_DP_REDUCT) */
@@ -365,7 +365,7 @@ int CNN_MM_ConvolutionNE16(
 
         /* Im2Col Size is aligned to 16bits for linear padding */
         int Im2ColSize = InFeat*Fcx*Fcy;
-        int WBuffSize = ALIGN(Im2ColSize, 4);
+        int WBuffSize = Im2ColSize;
         int BuffS = 2*ALIGN(Im2ColSize, 4);
 
         /* Layer number of operations and memory bandwidth requirements */
@@ -432,19 +432,18 @@ int CNN_MM_ConvolutionNE16(
         unsigned int DEFAULT_NE16_JOB_CFG = NE16_DefaultConfig(Filter_DataSizeBits, Mode16, StreamoutMode, FilterMode, LinearMode, StridedMode, NormBits, Streamin, \
                                                                WOffsetCfg, QuantRightShift, QuantBits, QuantNoRect, NormShift, NormBias);
 
-        int Ca=0;
-        KCArgs[Ca++] = TCArg(In_DataSize>0?CNN_ArgDataType(In_DataSize,1,1):CNN_ArgDataTypeUns(-In_DataSize,1,1), "In");
-        KCArgs[Ca++] = TCArg(CNN_ArgDataTypeUns(1,         1,1), "Filter"); // int16_t for 16 chin contributions
-        KCArgs[Ca++] = TCArg(CNN_ArgDataType(Bias_DataSize,1,1),   "Bias");
-        KCArgs[Ca++] = TCArg(Out_DataSize>0?CNN_ArgDataType(Out_DataSize,1,1):CNN_ArgDataTypeUns(-Out_DataSize,1,1), "Out");
-        KCArgs[Ca++] = TCArg(CNN_ArgDataTypeUns(1,         1,1),  "Scale");
-        KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,            1,1), "ScaleN");
-        KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,            1,1),  "Infos");
-
         Kernel = UserKernel(Name,
-                KernelIterSpace(3, IterParSpace(D1, OutFeat, OutTileCons), IterTiledSpace(T0), IterParSpace(D0, InFeat, InTileCons)),
+                KernelIterSpace(3, IterParSpace(D1, OutFeat, OutTileCons), IterTiledSpace(T0), IterParSpace(D0|SPACE_PROP_ONE_TILE, InFeat, InTileCons)),
                 TILE_HOR|TILE_HWC,
-                KCArgs,
+                CArgs(7,
+                        TCArg(In_DataSize>0?CNN_ArgDataType(In_DataSize,1,1):CNN_ArgDataTypeUns(-In_DataSize,1,1), "In"),
+                        TCArg(CNN_ArgDataTypeUns(1,         1,1), "Filter"), // int16_t for 16 chin contributions
+                        TCArg(CNN_ArgDataType(Bias_DataSize,1,1),   "Bias"),
+                        TCArg(Out_DataSize>0?CNN_ArgDataType(Out_DataSize,1,1):CNN_ArgDataTypeUns(-Out_DataSize,1,1), "Out"),
+                        TCArg(CNN_ArgDataTypeUns(1,         1,1),  "Scale"),
+                        TCArg(CNN_ArgDataType(1,            1,1), "ScaleN"),
+                        TCArg(CNN_ArgDataType(1,            1,1),  "Infos")
+                ),
                 Calls(6,
                         Call("NE16_Enable", LOC_D1_PROLOG, Bindings(0)),
                         Call("NE16_SoftReset", LOC_D0, Bindings(0)),
@@ -544,7 +543,7 @@ int CNN_MM_ConvolutionNE16(
                                       PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PadInp,
                                       ActOper);
         }
-        return (Kernel!=0);
+        return Kernel;
 }
 
 static Kernel_T *CNN_ConvolutionNE16_Internal(
@@ -604,6 +603,7 @@ static Kernel_T *CNN_ConvolutionNE16_Internal(
                 Fpx=1; Dpx=1; Spx=1; Fpy=1; Dpy=1; Spy=1;
         }
         if (Ctrl) {
+                if (Ctrl->TileOrientation != -1) TileOrientation = (Ctrl->TileOrientation==0)?TILE_HOR:TILE_VER;
                 if (Ctrl->PadType != -1) PadType = Ctrl->PadType;
         }
         int OverlapC, OverlapP;
@@ -680,7 +680,7 @@ static Kernel_T *CNN_ConvolutionNE16_Internal(
         LayerBandwidth += (Fcx*Fcy*Filter_DataSizeBits*InFeat*(DWConv?1:OutFeat)+7)/8;
         LayerBandwidth += Bias_DataSize*OutFeat;
 
-        if (ConvOper == KOP_CONV && Height == 1 && Fcx != 1 && Fcy == 1) ConvOper = KOP_CONV1D;
+        if (ConvOper == KOP_CONV && Height == 1 && Fcy == 1) ConvOper = KOP_CONV1D;
         ConvKerName = CNN_FindMatchingKernelAttr(ConvOper, KOP_NONE, ParFeat, CALL_NE16_KER, Abs(In_DataSize), Abs(Out_DataSize), Bias_DataSize, 0, 4, Fcx, Fcy, Dcx, Dcy, Scx, Scy,
                                                  &NeedFcx, &NeedFcy, &NeedDcx, &NeedDcy, &NeedScx, &NeedScy, 0);
         if (ConvKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Convolution basic kernel", Name);
@@ -778,7 +778,7 @@ static Kernel_T *CNN_ConvolutionNE16_Internal(
                 (DWConv?
                 KernelIterSpace(2, IterParSpace(D0|InFeatProp, InFeat, InTileCons), IterTiledSpace(T0)):
                 KernelIterSpace(3, IterParSpace(D1, OutFeat, OutTileCons), IterTiledSpace(T0), IterParSpace(D0|InFeatProp, InFeat, InTileCons))),
-                TILE_HOR|TILE_HWC,
+                TileOrientation|TILE_HWC,
                 KCArgs,
                 Calls(10,
                         Call("NE16_Enable", DWConv?LOC_D0_PROLOG:LOC_D1_PROLOG, Bindings(0)),
@@ -1122,13 +1122,13 @@ static Kernel_T *CNN_LinearAct_NE16_Internal(
 
         LinearKerName = CNN_FindMatchingKernelAttr(LinearOper, KOP_NONE, 0, CALL_NE16_KER, Abs(In_DataSize), 0, Bias_DataSize, 0,0,  0,0,0,0,0,0, 0,0,0,0,0,0, 0);
         if (LinearKerName==0) GenTilingError("CNN_LinearAct_NE16 Kernel: %s, Can't find a matching Linear basic kernel: %d %d", Name, Abs(In_DataSize), Bias_DataSize);
-        if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU))
+        if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_HSIGMOID || ActOper == KOP_SIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_TANH))
                 GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_HSIGMOID, KOP_HSWISH or KOP_LEAKYRELU", Name);
 
         /* Also when in/out are 16bits you need to streamout 32bits but here the reduction step will be done in the cluster (KOP_DP_REDUCT) */
         int NeedReductScale = Mode16; //Abs(Out_DataSize) == 2;
         int NeedSetBias = Mode16;
-        int NeedReductNoScale = ((InTileCons < InDim) && !InFeatOneTile) || NeedSetBias;
+        int NeedReductNoScale = ((InTileCons < InDim) && !InFeatOneTile) || NeedSetBias || (!(ActOper == KOP_RELU || ActOper == KOP_NONE));
         int NeedReduct = NeedReductNoScale || NeedReductScale;
 
         int NeedLinOut = NeedReduct || NeedSetBias;
@@ -1173,32 +1173,18 @@ static Kernel_T *CNN_LinearAct_NE16_Internal(
         }
         Kernel_T *Kernel;
 
-        CKernel_Arg_T **KCArgs = AllocateCArgs(7);
-        int Ca=0;
-        KCArgs[Ca++] = TCArg(In_DataSize>0?CNN_ArgDataType(In_DataSize,1,1):CNN_ArgDataTypeUns(-In_DataSize,1,1), "In");
-        KCArgs[Ca++] = TCArg(CNN_ArgDataTypeUns(1,1,1), "Filter");
-        KCArgs[Ca++] = TCArg(CNN_ArgDataType(Bs,1,1),   "Bias");
-        KCArgs[Ca++] = TCArg(Out_DataSize>0?CNN_ArgDataType(Out_DataSize,1,1):CNN_ArgDataTypeUns(-Out_DataSize,1,1), "Out");
-        KCArgs[Ca++] = TCArg(CNN_ArgDataTypeUns(1,1,1), "Scale");
-        KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1),    "ScaleN");
-        KCArgs[Ca++] = TCArg(CNN_ArgDataType(1,1,1),    "Infos");
-
-        Object_T **KArgs = AllocateKerArgs(7+NeedLinOut);
-        int Ka=0;
-        KArgs[Ka++] = KerArg("In",      KerArgSpace(1,D0|SPACE_PROP_PAD2PREF),   OBJ_IN_DB,            1, 1,  Abs(In_DataSize), 0, 0, 0, "In");
-        KArgs[Ka++] = KerArg("Filter",  KerArgSpace(2,D1,D0|Wp), OBJ_IN_DB|O_CONST|Wa, 1, 1,  Ws,           0, 0, 0, "Filter");
-        KArgs[Ka++] = KerArg("Bias",    KerArgSpace(1,D1),   OBJ_IN_DB|O_CONST,    1, 1,  Bs,       0, 0, 0, "Bias");
-        if (NeedLinOut)
-        KArgs[Ka++] = KerArg("LinOut",  KerArgSpace(1,D1),   O_BUFF|O_ONETILE,     1, 1,  Ls,           0, 0, 0, "");
-        KArgs[Ka++] = KerArg("Out",     KerArgSpace(1,D1),   OBJ_OUT_DB,           1, 1,  Abs(Out_DataSize), 0, 0, 0, "Out");
-        KArgs[Ka++] = KerArg("Scale",   KerArgSpace(1,D1),   OBJ_IN_DB|O_CONST,    1, 1,  1,            0, 0, 0, "Scale");
-        KArgs[Ka++] = KerArg("ScaleN",  KerArgSpace(1,D1),   OBJ_IN_DB|O_CONST,    1, 1,  1,            0, 0, 0, "ScaleN");
-        KArgs[Ka++] = KerArg("Infos",   KerArgSpace(1,T0),   O_IN|O_BUFF|O_NTILED, 1, 1,  AT_INF_NE16_DIM*1, 0, 0, 0, "Infos");
-
         Kernel = UserKernel(Name,
                 KernelIterSpace(3, IterTiledSpace(T0), IterParSpace(D1, OutDim, OutTileCons), IterParSpace(D0|InFeatProp, InDim, InTileCons)),
                 TileOrientation,
-                KCArgs,
+                CArgs(7,
+                        TCArg(In_DataSize>0?CNN_ArgDataType(In_DataSize,1,1):CNN_ArgDataTypeUns(-In_DataSize,1,1), "In"),
+                        TCArg(CNN_ArgDataTypeUns(1,1,1), "Filter"),
+                        TCArg(CNN_ArgDataType(Bs,1,1),   "Bias"),
+                        TCArg(Out_DataSize>0?CNN_ArgDataType(Out_DataSize,1,1):CNN_ArgDataTypeUns(-Out_DataSize,1,1), "Out"),
+                        TCArg(CNN_ArgDataTypeUns(1,1,1), "Scale"),
+                        TCArg(CNN_ArgDataType(1,1,1),    "ScaleN"),
+                        TCArg(CNN_ArgDataType(1,1,1),    "Infos")
+                ),
                 Calls(6,
                         Call("NE16_Enable", LOC_PROLOG, Bindings(0)),
                         SetBiasKerName?Call(SetBiasKerName, LOC_D0_PROLOG,
@@ -1244,7 +1230,16 @@ static Kernel_T *CNN_LinearAct_NE16_Internal(
                         ),
                         Call("NE16_Disable", LOC_EPILOG, Bindings(0))
                 ),
-                KArgs
+                KerArgs(8,
+                        KerArg("In",      KerArgSpace(1,D0|SPACE_PROP_PAD2PREF),OBJ_IN_DB,            1, 1,  Abs(In_DataSize),  0, 0, 0, "In"),
+                        KerArg("Filter",  KerArgSpace(2,D1,D0|Wp),              OBJ_IN_DB|O_CONST|Wa, 1, 1,  Ws,                0, 0, 0, "Filter"),
+                        KerArg("Bias",    KerArgSpace(1,D1),                    OBJ_IN_DB|O_CONST,    1, 1,  Bs,                0, 0, 0, "Bias"),
+             NeedLinOut?KerArg("LinOut",  KerArgSpace(1,D1),                    O_BUFF|O_ONETILE,     1, 1,  Ls,                0, 0, 0, ""):AT_NO_KER_ARG,
+                        KerArg("Out",     KerArgSpace(1,D1),                    OBJ_OUT_DB,           1, 1,  Abs(Out_DataSize), 0, 0, 0, "Out"),
+                        KerArg("Scale",   KerArgSpace(1,D1),                    OBJ_IN_DB|O_CONST,    1, 1,  1,                 0, 0, 0, "Scale"),
+                        KerArg("ScaleN",  KerArgSpace(1,D1),                    OBJ_IN_DB|O_CONST,    1, 1,  1,                 0, 0, 0, "ScaleN"),
+                        KerArg("Infos",   KerArgSpace(1,T0),                    O_IN|O_BUFF|O_NTILED, 1, 1,  AT_INF_NE16_DIM*1, 0, 0, 0, "Infos")
+                )
         );
         if (Kernel) {
                 AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0);
@@ -1385,6 +1380,8 @@ int CNN_MatMulAct_NE16(
         )
 
 {
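+        /* The NE16 matmul is now generated as a 1x1 convolution: the matrix
+         * dimensions (ColM1, ColM2, LineM1) are remapped onto the conv
+         * generator's feature/spatial arguments; the former dedicated
+         * implementation is kept below, disabled under #if 0. */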
+        return CNN_ConvolutionNE16(Name, Ctrl, In_DataSize, Out_DataSize, Bias_DataSize, 1, Filter_DataSizeBits, ColM1, ColM2, LineM1, 1, KOP_CONV,1,1,1,1,1,1,0,0, KOP_NONE,0,0,0,0,0,0,0, ActOper);
+#if 0
         int Log=1;
         if (Abs(In_DataSize)!=1 && Abs(In_DataSize)!=2) GenTilingError("Node: %s Input DataSize %d not supported in NE16", Name, In_DataSize);
 
@@ -1555,4 +1552,5 @@ int CNN_MatMulAct_NE16(
                 AddKernelArgDim(Name, "Infos", 2, AT_INF_DIM, 1);
         }
         return Kernel!=0;
+#endif
 }
\ No newline at end of file
diff --git a/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.h b/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.h
index ac6285a00..b67494872 100644
--- a/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.h
+++ b/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.h
@@ -59,7 +59,7 @@ int CNN_ConvolutionNE16(
         KernelOper_T ActOper
         );
 
-int CNN_MM_ConvolutionNE16(
+Kernel_T *CNN_MM_ConvolutionNE16(
         char         *Name,
 
         CNN_GenControl_T *Ctrl,
diff --git a/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c b/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c
index edc19bab0..99d97952d 100644
--- a/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c
+++ b/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c
@@ -730,6 +730,10 @@ void LoadCNN_SQ8_Library()
 											CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
 	LibKernel("KerPar_MM_Conv1x1_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
 											CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv1x1_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
+											CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv1x1_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
+											CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
 	LibKernel("KerPar_MM_Conv1D_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
 											CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
 	LibKernel("KerPar_MM_Conv1D_DxDy_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
@@ -1140,7 +1144,7 @@ void LoadCNN_SQ8_Library()
 
 *********************************************************************************************************************************************************************/
 
-int CNN_MM_ConvolutionPoolAct_SQ8(
+static Kernel_T *CNN_MM_ConvolutionPoolAct_SQ8_Internal(
 	char         *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -1177,9 +1181,9 @@ int CNN_MM_ConvolutionPoolAct_SQ8(
 {
 	if (ConvOper==KOP_NONE) {
 		if (PoolOper!=KOP_NONE)
-			return CNN_PoolAct_SQ8(Name, Ctrl, InFeat, Width, Height, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+			return CNN_PoolAct_SQ8_Internal(Name, Ctrl, InFeat, Width, Height, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
 		else if (ActOper!=KOP_NONE)
-			return CNN_Act_SQ8(Name, Ctrl, InFeat, Width, Height, ActOper);
+			return CNN_Act_SQ8_Internal(Name, Ctrl, InFeat, Width, Height, ActOper);
 		else GenTilingError("CNN_MM_ConvolutionPoolAct_SQ8 Kernel: %s, All requested operations are KOP_NONE", Name);
 	}
 
@@ -1197,6 +1201,10 @@ int CNN_MM_ConvolutionPoolAct_SQ8(
 		if (Ctrl->HWC) HWC = 1;
 		if (Ctrl->ParallelFeatures != -1) ParFeatConv = Ctrl->ParallelFeatures;
 	}
+
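+	/* A 1x1, unit-stride, unit-dilation HWC convolution is exactly a
+	 * transposed matrix multiply over the (H*W) x InFeat activations,
+	 * so route it straight to the matmul generator. */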
+	if (HWC && Fcy==1 && Fcx==1 && Scy==1 && Scx==1 && Dcy==1 && Dcx==1)
+		return CNN_MatMulAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, Height*Width, OutFeat, InFeat, 0,0,0,0, KOP_MATMUL_TRANSPOSED, ActOper, 0);
+
 	if (ParFeatConv == 2 && HWC && Fcy>1 && (InFeat < 8))
 		ParFeatConv = 0;
 	else
@@ -1253,7 +1261,7 @@ int CNN_MM_ConvolutionPoolAct_SQ8(
 	int BuffS = ALIGN(InFeat*Fcx*Fcy, 3);
 	if (HWC) {
 		if (Fcx==1&&Fcy==1) BuffS = 1;
-		else if (ParFeatConv) BuffS = ALIGN(InFeat*Fcx*Fcy, 3);
+		else if (ParFeatConv) BuffS = 2*ALIGN(InFeat*Fcx*Fcy, 3);
 		else BuffS = 2 * InFeat*Fcx*Fcy*8;
 	}
 
@@ -1453,11 +1461,11 @@ int CNN_MM_ConvolutionPoolAct_SQ8(
 				      PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PadInp,
 				      ActOper);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 
-int CNN_HWC_DWConvolutionPoolAct_SQ8(
+static Kernel_T *CNN_HWC_DWConvolutionPoolAct_SQ8_Internal(
 	char         *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -1494,9 +1502,9 @@ int CNN_HWC_DWConvolutionPoolAct_SQ8(
 {
 	if (ConvOper==KOP_NONE) {
 		if (PoolOper!=KOP_NONE)
-			return CNN_PoolAct_SQ8(Name, Ctrl, InFeat, Width, Height, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+			return CNN_PoolAct_SQ8_Internal(Name, Ctrl, InFeat, Width, Height, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
 		else if (ActOper!=KOP_NONE)
-			return CNN_Act_SQ8(Name, Ctrl, InFeat, Width, Height, ActOper);
+			return CNN_Act_SQ8_Internal(Name, Ctrl, InFeat, Width, Height, ActOper);
 		else GenTilingError("CNN_HWC_DWConvolutionPoolAct_SQ8: %s, All requested operations are KOP_NONE", Name);
 	}
 
@@ -1606,7 +1614,7 @@ int CNN_HWC_DWConvolutionPoolAct_SQ8(
 	}
 
 	if (Log) {
-		printf("InFeat: %d, OutFeat: %d%s\n", InFeat, OutFeat, HWC?", HWC":", CHW");
+		printf("InFeat: %d, OutFeat: %d%s - TileOrientation: %s\n", InFeat, OutFeat, HWC?", HWC":", CHW", TileOrientation==TILE_HOR?"TILE_HOR":"TILE_VER");
         	printf("Conv => W:  %d, Pad:[%d,%d] PadT:[%d,%d] => Wc: %d, Filter:[%d,%d]\n", Width,  PadInc[0], PadInc[1], PadIncT[0], PadIncT[1], Wc, Fcx, Fcy);
         	printf("     => H:  %d, Pad:[%d,%d] PadT:[%d,%d] => Hc: %d\n", Height, PadInc[2], PadInc[3], PadIncT[2], PadIncT[3], Hc);
         	printf("Pool => Wc: %d, Pad:[%d,%d] => Wo: %d, Filter:[%d,%d]\n", Wc, PadInp[0], PadInp[1], Wo, Fpx, Fpy);
@@ -1744,7 +1752,7 @@ int CNN_HWC_DWConvolutionPoolAct_SQ8(
 				      PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PadInp,
 				      ActOper);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 /*********************************************************************************************************************************************************************
@@ -1790,7 +1798,7 @@ int CNN_HWC_DWConvolutionPoolAct_SQ8(
 	
 *********************************************************************************************************************************************************************/
 
-int CNN_ConvolutionPoolAct_SQ8(
+Kernel_T *CNN_ConvolutionPoolAct_SQ8_Internal(
 	char         *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -1850,31 +1858,31 @@ int CNN_ConvolutionPoolAct_SQ8(
         }
 	if (ConvOper==KOP_NONE) {
 		if (PoolOper!=KOP_NONE)
-			return CNN_PoolAct_SQ8(Name, Ctrl, InFeat, Width, Height, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+			return CNN_PoolAct_SQ8_Internal(Name, Ctrl, InFeat, Width, Height, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
 		else if (ActOper!=KOP_NONE)
-			return CNN_Act_SQ8(Name, Ctrl, InFeat, Width, Height, ActOper);
+			return CNN_Act_SQ8_Internal(Name, Ctrl, InFeat, Width, Height, ActOper);
 		else GenTilingError("CNN_ConvolutionPoolAct_SQ8 Kernel: %s, All requested operations are KOP_NONE", Name);
 	} else if (HWC) {
 		if (ConvOper == KOP_CONV_DW)
-			return CNN_HWC_DWConvolutionPoolAct_SQ8(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width, Height,
+			return CNN_HWC_DWConvolutionPoolAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width, Height,
 								ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad,
 								PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
 		else
-			return CNN_MM_ConvolutionPoolAct_SQ8(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width, Height,
+			return CNN_MM_ConvolutionPoolAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width, Height,
 							     ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad,
 							     PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
 	} else if (ConvOper==KOP_CONV && ((Fcx > 1 && Fcy == 1))) {
-		AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF);
-		int Ok = CNN_MM_ConvolutionPoolAct_SQ8(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width, Height,
+		// AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF);
+		Kernel_T *Ok = CNN_MM_ConvolutionPoolAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width, Height,
 						       ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad,
 				      		       PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
-		AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON);
-		if (Ok) return Ok;
+		// AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON);
+		if (Ok!=0) return Ok;
 		if (Log) printf("No solution found for im2col scheme, reverting to standard implementation\n");
 	}
 	if (Fcx==1 && Fcy==1 && Scx==1 && Scy==1 && Dcx==1 && Dcy==1 && Height==1 && Width==1) {
 		printf("This is a pointwise on 1x1 input --> Mapping to CNN_Linear_NE16\n");
-		return CNN_LinearAct_SQ8(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, KOP_LINEAR, ActOper);
+		return CNN_LinearAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, KOP_LINEAR, ActOper);
 	}
 
 	if (PoolOper==KOP_NONE) {
@@ -1997,12 +2005,12 @@ int CNN_ConvolutionPoolAct_SQ8(
 		// if ((InFeat+OutFeat)<80) {
 		if ((InFeat+OutFeat)<100 && (Scx==1) && (Scy==1)) {
 			if (Log) printf("Mapping this convolution to matrix multiplication with small first operand\n");
-			int Ok = CNN_MatMulSmallM1Act_SQ8(Name, 0, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width*Height, InFeat, Width, Height, Scx, Scy, KOP_MATMUL_SM1, ActOper);
+			Kernel_T *Ok = CNN_MatMulSmallM1Act_SQ8_Internal(Name, 0, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width*Height, InFeat, Width, Height, Scx, Scy, KOP_MATMUL_SM1, ActOper);
 			if (!Ok&&Log) printf("Mapping this convolution to matrix multiplication with small first operand FAILED, trying with standard mult implementation\n");
 			if (Ok) return Ok;
 		}
 		if (Log) printf("Mapping this convolution to matrix multiplication\n");
-		int Ok = CNN_MatMulAct_SQ8(Name, 0, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width*Height, InFeat, Width, Height, Scx, Scy, KOP_MATMUL, ActOper);
+		Kernel_T *Ok = CNN_MatMulAct_SQ8_Internal(Name, 0, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width*Height, InFeat, Width, Height, Scx, Scy, KOP_MATMUL, ActOper, 1);
 		AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON);
 		if (Ok) return Ok;
 		if (Log) printf("Mapping this convolution to matrix multiplication FAILED, reverting to standard implementation\n");
@@ -2159,7 +2167,7 @@ int CNN_ConvolutionPoolAct_SQ8(
 				      PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PadInp,
 				      ActOper);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 /*********************************************************************************************************************************************************************
@@ -2345,7 +2353,7 @@ int CNN_GroupedConvolutionPoolAct_SQ8(
 		
 *********************************************************************************************************************************************************************/
 
-int CNN_PoolAct_SQ8(
+Kernel_T * CNN_PoolAct_SQ8_Internal(
 	char *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -2367,7 +2375,7 @@ int CNN_PoolAct_SQ8(
 	)
 
 {
-	if (PoolOper==KOP_NONE && ActOper!=KOP_NONE) return CNN_Act_SQ8(Name, Ctrl, Feat, Width, Height, ActOper);
+	if (PoolOper==KOP_NONE && ActOper!=KOP_NONE) return CNN_Act_SQ8_Internal(Name, Ctrl, Feat, Width, Height, ActOper);
 
 	Tile_Orientation_T TileOrientation = TILE_HOR;
 	int ParFeat = 1, HWC = 0;
@@ -2524,10 +2532,11 @@ int CNN_PoolAct_SQ8(
 
 		AT_PrepareForTest_SQ8(Name, Feat, Feat, Width, Height, 1, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PadInp, 0, 0,0,0,0,0,0,(v4s) 0, ActOper);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 
+
 /*********************************************************************************************************************************************************************
  	Generator for Activation with tensor centric scaling
 
@@ -2549,7 +2558,7 @@ int CNN_PoolAct_SQ8(
 		
 *********************************************************************************************************************************************************************/
 
-int CNN_Act_SQ8(
+Kernel_T * CNN_Act_SQ8_Internal(
 	char *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -2626,9 +2635,10 @@ int CNN_Act_SQ8(
 
 		AT_PrepareForTest_SQ8(Name, Feat,Feat,Width,Height, 1, ActOper, 0,0,0,0,0,0,(v4s) 0, 0, 0,0,0,0,0,0,(v4s) 0, KOP_NONE);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
+
 /*********************************************************************************************************************************************************************
  	Generator for Global Pooling (Max or Average) with tensor centric scaling and optional activation
 
@@ -2652,7 +2662,7 @@ int CNN_Act_SQ8(
 		
 *********************************************************************************************************************************************************************/
 
-int CNN_GlobalPoolAct_SQ8(
+static Kernel_T *CNN_GlobalPoolAct_SQ8_Internal(
 	char *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -2843,10 +2853,9 @@ int CNN_GlobalPoolAct_SQ8(
 
 		AT_PrepareForTest_SQ8(Name, Feat,Feat,Width,Height, 1, PoolOper, 0,0,0,0,0,0, (v4s)0, 0, 0,0,0,0,0,0,(v4s) 0, ActOper);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
-
 /*********************************************************************************************************************************************************************
  	Generator for Linear layers followed wth channel centric scaling followed by an optional activation
 
@@ -2870,7 +2879,7 @@ int CNN_GlobalPoolAct_SQ8(
 	
 *********************************************************************************************************************************************************************/
 
-int CNN_LinearAct_SQ8(
+Kernel_T * CNN_LinearAct_SQ8_Internal(
 	char *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -3074,7 +3083,7 @@ int CNN_LinearAct_SQ8(
 
 		AT_PrepareForTest_SQ8(Name, InDim,OutDim,1,1, Bias_DataSize, LinearOper, 0,0,0,0,0,0,(v4s)0, 0, 0,0,0,0,0,0,(v4s)0, ActOper);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 /*********************************************************************************************************************************************************************
@@ -3096,7 +3105,7 @@ int CNN_LinearAct_SQ8(
                 a different code. By definition Output contains value is the [0.0 .. 1.0] range with sum(Output)=1.0. Results are always represented in Q15
 *********************************************************************************************************************************************************************/
 
-int CNN_SoftMax_SQ8(
+static Kernel_T * CNN_SoftMax_SQ8_Internal(
 	char *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -3158,10 +3167,10 @@ int CNN_SoftMax_SQ8(
 
 		AT_PrepareForTest_SQ8(Name, Dim,Dim,1,1, 1, SoftMaxOper, 0,0,0,0,0,0,(v4s)0, 0, 0,0,0,0,0,0,(v4s)0, 0);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
-int CNN_SoftMax2D_SQ8(
+static Kernel_T * CNN_SoftMax2D_SQ8_Internal(
 	char *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -3231,7 +3240,7 @@ int CNN_SoftMax2D_SQ8(
 
 		AT_PrepareForTest_SQ8(Name, Dim,Dim,1,1, 1, SoftMaxOper, 0,0,0,0,0,0,(v4s)0, 0, 0,0,0,0,0,0,(v4s)0, 0);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 /*********************************************************************************************************************************************************************
@@ -3255,7 +3264,7 @@ int CNN_SoftMax2D_SQ8(
 	
 *********************************************************************************************************************************************************************/
 
-int CNN_MatAddAct_SQ8(
+static Kernel_T * CNN_MatAddAct_SQ8_Internal(
 	char *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -3350,7 +3359,7 @@ int CNN_MatAddAct_SQ8(
 
 		AT_PrepareForTest_SQ8(Name, Feat,Feat,Width,Height, 1, AddMatOper, 0,0,0,0,0,0,(v4s)0, 0, 0,0,0,0,0,0,(v4s)0, ActOper);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 /*********************************************************************************************************************************************************************
@@ -3711,7 +3720,7 @@ int CNN_TensorVectMultAct_SQ8(
 	
 *********************************************************************************************************************************************************************/
 
-int CNN_MatMulAct_SQ8(
+Kernel_T *CNN_MatMulAct_SQ8_Internal(
 	char *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -3730,7 +3739,8 @@ int CNN_MatMulAct_SQ8(
 	int Scy,
 
         KernelOper_T MatMulOper,
-        KernelOper_T ActOper
+        KernelOper_T ActOper,
+        int InvertInputs
 	)
 
 {
@@ -3804,8 +3814,8 @@ int CNN_MatMulAct_SQ8(
 		KernelIterSpace(2, IterTiledSpace(T1), IterTiledSpace(T0)),
                 TILE_HOR,
                 CArgs(7,
-                      TCArg(CNN_ArgDataType(1,1,1),  "In2"),
-                      TCArg(CNN_ArgDataType(1,1,1),  "In1"),
+                      TCArg(CNN_ArgDataType(1,1,1),  InvertInputs?"In2":"In1"),
+                      TCArg(CNN_ArgDataType(1,1,1),  InvertInputs?"In1":"In2"),
                       Bias_DataSize?TCArg(CNN_ArgDataType(Bias_DataSize,1,1), "Bias"):AT_NO_C_ARG,
                       TCArg(CNN_ArgDataType(1,1,1),  "Out"),
                       !ScaleScalar?TCArg(CNN_ArgDataTypeUns(1,1,1),"Scale"):AT_NO_C_ARG,
@@ -3879,7 +3889,7 @@ int CNN_MatMulAct_SQ8(
 
 		AT_PrepareForTest_SQ8(Name, ColM1,LineM1,Width,Height, Bias_DataSize, MatMulOper, 1,1,1,1,Scx,Scy,(v4s)0, 0, 0,0,0,0,0,0,(v4s)0, ActOper);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 /*********************************************************************************************************************************************************************
@@ -3920,7 +3930,7 @@ int CNN_MatMulAct_SQ8(
 	
 *********************************************************************************************************************************************************************/
 
-int CNN_MatMulSmallM1Act_SQ8(
+Kernel_T * CNN_MatMulSmallM1Act_SQ8_Internal(
 	char *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -4077,7 +4087,295 @@ int CNN_MatMulSmallM1Act_SQ8(
 
 		AT_PrepareForTest_SQ8(Name, ColM1,LineM1,Width,Height, Bias_DataSize, MatMulOper, 1,1,1,1,Scx,Scy,(v4s)0, 0, 0,0,0,0,0,0,(v4s)0, ActOper);
 	}
-	return (Kernel!=0);
+	return Kernel;
+
+}
+
+/* ============================================================================================================================================================== */
+/* ============================================================================================================================================================== */
+/* ============================================================================================================================================================== */
+/* ============================================================================================================================================================== */
+/* ============================================================================================================================================================== */
+/* ============================================================================================================================================================== */
+/* ============================================================================================================================================================== */
+
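+/* Public generators below are thin wrappers: unless the caller pins an
+ * orientation through Ctrl->TileOrientation, each runs the _Internal
+ * generator for TILE_HOR then TILE_VER, compares Cost->TileOverhead with a
+ * K=0.9 bias in favor of TILE_HOR, and commits the cheaper solution. */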
+int CNN_MM_ConvolutionPoolAct_SQ8(
+	char         *Name,
+
+	CNN_GenControl_T *Ctrl,
+
+	int Bias_DataSize,
+	int Scale_DataSize,
+
+       	int InFeat,
+       	int OutFeat,
+       	int Width,
+       	int Height,
+
+	KernelOper_T ConvOper,
+       	int Fcx,
+       	int Fcy,
+	int Dcx,
+	int Dcy,
+	int Scx,
+	int Scy,
+	int ConvPad,
+
+	KernelOper_T PoolOper,
+	int Fpx,
+	int Fpy,
+	int Dpx,
+	int Dpy,
+	int Spx,
+	int Spy,
+	int PoolPad,
+
+	KernelOper_T ActOper
+	)
+{
+	Kernel_T *Ker = 0, *Sol1 = 0, *Sol2 = 0;
+        float K = 0.9;
+        Tile_Orientation_T TileOrientation = TILE_HOR;
+        if (Ctrl) {
+		if (Ctrl->TileOrientation != -1) {
+			printf("TileOrientation set by user\n");
+			Ker = CNN_MM_ConvolutionPoolAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+			if (Ker!=0) return 1;
+			else GenTilingError("CNN_MM_ConvolutionPoolAct_SQ8: %s, Failed to generate with the user-set tiling orientation, try letting the Autotiler choose it", Name);
+		}
+	}
+
+	AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF);
+
+	CNN_GenControl_T InternalCtrl;
+	if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl);
+    	else 	   InternalCtrl = *Ctrl;
+
+	printf("\n\n=============================== Trying Tile Orientation: TILE_HOR ===============================\n\n");
+    	CNN_SetGenCtrl(&InternalCtrl, "TILEORIENTATION", AT_OPT_VAL(0));
+        Ker = CNN_MM_ConvolutionPoolAct_SQ8_Internal(Name, &InternalCtrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+        if (Ker) Sol1 = CopyAndPopUserKernel(Ker);
+
+	printf("\n=============================== Trying Tile Orientation: TILE_VER ===============================\n\n");
+    	CNN_SetGenCtrl(&InternalCtrl, "TILEORIENTATION", AT_OPT_VAL(1));
+        Ker = CNN_MM_ConvolutionPoolAct_SQ8_Internal(Name, &InternalCtrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+        if (Ker) Sol2 = CopyAndPopUserKernel(Ker);
+
+        if (Sol1 && Sol2) {
+		int TakeSol1 = ((K*Sol1->Cost->TileOverhead) < Sol2->Cost->TileOverhead);  // K close to 1.0 biases the choice toward TILE_HOR
+		printf(">>>>>>>>>>>>>>>>>> %s is better: %.3f vs %.3f \n\n\n", TakeSol1?"TILE_HOR":"TILE_VER", Sol1->Cost->TileOverhead, Sol2->Cost->TileOverhead);
+		if (TakeSol1) {
+                    PushBackUserKernel(Sol1); ReleaseUserKernel(Sol2);
+                } else {
+                    PushBackUserKernel(Sol2); ReleaseUserKernel(Sol1);
+                }
+        } else if (Sol1) {
+                PushBackUserKernel(Sol1); ReleaseUserKernel(Sol2);
+	} else if (Sol2) {
+                PushBackUserKernel(Sol2); ReleaseUserKernel(Sol1);
+	} else {
+		GenTilingError("Failed to generate code for Kernel: %s", Name);
+	}
+        AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON);
+        return 1;
+}
+
+int CNN_HWC_DWConvolutionPoolAct_SQ8(
+	char         *Name,
+
+	CNN_GenControl_T *Ctrl,
+
+	int Bias_DataSize,
+	int Scale_DataSize,
+
+       	int InFeat,
+       	int OutFeat,
+       	int Width,
+       	int Height,
+
+	KernelOper_T ConvOper,
+       	int Fcx,
+       	int Fcy,
+	int Dcx,
+	int Dcy,
+	int Scx,
+	int Scy,
+	int ConvPad,
+
+	KernelOper_T PoolOper,
+	int Fpx,
+	int Fpy,
+	int Dpx,
+	int Dpy,
+	int Spx,
+	int Spy,
+	int PoolPad,
+
+	KernelOper_T ActOper
+	)
+{
+	Kernel_T *Ker = 0, *Sol1 = 0, *Sol2 = 0;
+        float K = 0.9;
+        Tile_Orientation_T TileOrientation = TILE_HOR;
+        if (Ctrl) {
+		if (Ctrl->TileOrientation != -1) {
+			printf("TileOrientation set by user\n");
+			Ker = CNN_HWC_DWConvolutionPoolAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+			if (Ker!=0) return 1;
+			else GenTilingError("CNN_HWC_DWConvolutionPoolAct_SQ8: %s, Failed to generate with the user-set tiling orientation, try letting the Autotiler choose it", Name);
+		}
+	}
+
+	AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF);
+
+	CNN_GenControl_T InternalCtrl;
+	if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl);
+    	else 	   InternalCtrl = *Ctrl;
+
+	printf("\n\n=============================== Trying Tile Orientation: TILE_HOR ===============================\n\n");
+    	CNN_SetGenCtrl(&InternalCtrl, "TILEORIENTATION", AT_OPT_VAL(0));
+        Ker = CNN_HWC_DWConvolutionPoolAct_SQ8_Internal(Name, &InternalCtrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+        if (Ker) Sol1 = CopyAndPopUserKernel(Ker);
+
+	printf("\n=============================== Trying Tile Orientation: TILE_VER ===============================\n\n");
+    	CNN_SetGenCtrl(&InternalCtrl, "TILEORIENTATION", AT_OPT_VAL(1));
+        Ker = CNN_HWC_DWConvolutionPoolAct_SQ8_Internal(Name, &InternalCtrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+        if (Ker) Sol2 = CopyAndPopUserKernel(Ker);
+
+        if (Sol1 && Sol2) {
+		int TakeSol1 = ((K*Sol1->Cost->TileOverhead) < Sol2->Cost->TileOverhead);  // K close to 1.0 biases the choice toward TILE_HOR
+		printf(">>>>>>>>>>>>>>>>>> %s is better: %.3f vs %.3f \n\n\n", TakeSol1?"TILE_HOR":"TILE_VER", Sol1->Cost->TileOverhead, Sol2->Cost->TileOverhead);
+		if (TakeSol1) {
+                    PushBackUserKernel(Sol1); ReleaseUserKernel(Sol2);
+                } else {
+                    PushBackUserKernel(Sol2); ReleaseUserKernel(Sol1);
+                }
+        } else if (Sol1) {
+                PushBackUserKernel(Sol1); ReleaseUserKernel(Sol2);
+	} else if (Sol2) {
+                PushBackUserKernel(Sol2); ReleaseUserKernel(Sol1);
+	} else {
+		GenTilingError("Failed to generate code for Kernel: %s", Name);
+	}
+        AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON);
+        return 1;
+}
+
+int CNN_ConvolutionPoolAct_SQ8(
+	char         *Name,
+
+	CNN_GenControl_T *Ctrl,
+
+	int Bias_DataSize,
+	int Scale_DataSize,
+
+       	int InFeat,
+       	int OutFeat,
+       	int Width,
+       	int Height,
+
+	KernelOper_T ConvOper,
+       	int Fcx,
+       	int Fcy,
+	int Dcx,
+	int Dcy,
+	int Scx,
+	int Scy,
+	int ConvPad,
+
+	KernelOper_T PoolOper,
+	int Fpx,
+	int Fpy,
+	int Dpx,
+	int Dpy,
+	int Spx,
+	int Spy,
+	int PoolPad,
+
+	KernelOper_T ActOper
+	)
+{
+	Kernel_T *Ker = 0, *Sol1 = 0, *Sol2 = 0;
+        float K = 0.9;
+        Tile_Orientation_T TileOrientation = TILE_HOR;
+        if (Ctrl) {
+		if (Ctrl->TileOrientation != -1) {
+			printf("TileOrientation set by user\n");
+			Ker = CNN_ConvolutionPoolAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+			if (Ker!=0) return 1;
+			else GenTilingError("CNN_ConvolutionPoolAct_SQ8: %s, Failed to generate with the user-set tiling orientation, try letting the Autotiler choose it", Name);
+		}
+	}
+	AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF);
+
+	CNN_GenControl_T InternalCtrl;
+	if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl);
+    	else 	   InternalCtrl = *Ctrl;
+
+	printf("\n\n=============================== Trying Tile Orientation: TILE_HOR ===============================\n\n");
+    	CNN_SetGenCtrl(&InternalCtrl, "TILEORIENTATION", AT_OPT_VAL(0));
+        Ker = CNN_ConvolutionPoolAct_SQ8_Internal(Name, &InternalCtrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+        if (Ker) Sol1 = CopyAndPopUserKernel(Ker);
+
+	printf("\n=============================== Trying Tile Orientation: TILE_VER ===============================\n\n");
+    	CNN_SetGenCtrl(&InternalCtrl, "TILEORIENTATION", AT_OPT_VAL(1));
+        Ker = CNN_ConvolutionPoolAct_SQ8_Internal(Name, &InternalCtrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+        if (Ker) Sol2 = CopyAndPopUserKernel(Ker);
+
+        if (Sol1 && Sol2) {
+		int TakeSol1 = ((K*Sol1->Cost->TileOverhead) < Sol2->Cost->TileOverhead);  // K close to 1.0 biases the choice toward TILE_HOR
+		printf(">>>>>>>>>>>>>>>>>> %s is better: %.3f vs %.3f \n\n\n", TakeSol1?"TILE_HOR":"TILE_VER", Sol1->Cost->TileOverhead, Sol2->Cost->TileOverhead);
+		if (TakeSol1) {
+                    PushBackUserKernel(Sol1); ReleaseUserKernel(Sol2);
+                } else {
+                    PushBackUserKernel(Sol2); ReleaseUserKernel(Sol1);
+                }
+        } else if (Sol1) {
+                PushBackUserKernel(Sol1); ReleaseUserKernel(Sol2);
+	} else if (Sol2) {
+                PushBackUserKernel(Sol2); ReleaseUserKernel(Sol1);
+	} else {
+		GenTilingError("Failed to generate code for Kernel: %s", Name);
+	}
+        AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON);
+        return 1;
+}
+
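+/* Thin int-returning wrappers preserving the original public API on top of
+ * the Kernel_T*-returning _Internal generators. */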
+int CNN_PoolAct_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Feat, int Width, int Height, KernelOper_T PoolOper, int Fpx, int Fpy, int Dpx, int Dpy, int Spx, int Spy, int PoolPad, KernelOper_T ActOper)
+{
+	return (CNN_PoolAct_SQ8_Internal(Name, Ctrl, Feat, Width, Height, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper)!=0);
+}
+
+int CNN_Act_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Feat, int Width, int Height,KernelOper_T ActOper) {
+	return (CNN_Act_SQ8_Internal(Name, Ctrl, Feat, Width, Height, ActOper)!=0);
+}
+
+int CNN_GlobalPoolAct_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Feat, int Width, int Height, KernelOper_T PoolOper, KernelOper_T ActOper) {
+	return (CNN_GlobalPoolAct_SQ8_Internal(Name, Ctrl, Feat, Width, Height, PoolOper, ActOper)!=0);
+}
+
+int CNN_LinearAct_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Bias_DataSize, int Scale_DataSize, int InDim, int OutDim, KernelOper_T LinearOper, KernelOper_T ActOper)
+{
+	return (CNN_LinearAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, InDim, OutDim, LinearOper, ActOper)!=0);
+}
+
+int CNN_SoftMax_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Dim, KernelOper_T SoftMaxOper) {
+	return (CNN_SoftMax_SQ8_Internal(Name, Ctrl, Dim, SoftMaxOper)!=0);
+}
+
+int CNN_SoftMax2D_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Dim, int N, KernelOper_T SoftMaxOper) {
+	return (CNN_SoftMax2D_SQ8_Internal(Name, Ctrl, Dim, N, SoftMaxOper)!=0);
+}
+
+int CNN_MatAddAct_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Feat, int Width, int Height, KernelOper_T AddMatOper, KernelOper_T ActOper) {
+	return (CNN_MatAddAct_SQ8_Internal(Name, Ctrl, Feat, Width, Height, AddMatOper, ActOper)!=0);
+}
+
+int CNN_MatMulAct_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Bias_DataSize, int Scale_DataSize, int ColM1, int LineM1, int ColM2, int LineM2, int Width, int Height, int Scx, int Scy, KernelOper_T MatMulOper, KernelOper_T ActOper) {
+	return (CNN_MatMulAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, ColM1, LineM1, ColM2, LineM2, Width, Height, Scx, Scy, MatMulOper, ActOper, 1)!=0);
+}
 
+int CNN_MatMulSmallM1Act_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Bias_DataSize, int Scale_DataSize, int ColM1, int LineM1, int ColM2, int LineM2, int Width, int Height, int Scx, int Scy, KernelOper_T MatMulOper, KernelOper_T ActOper) {
+	return (CNN_MatMulSmallM1Act_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, ColM1, LineM1, ColM2, LineM2, Width, Height, Scx, Scy, MatMulOper, ActOper)!=0);
 }
 
diff --git a/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.h b/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.h
index 6b0ca7d15..ba223b55d 100644
--- a/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.h
+++ b/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.h
@@ -67,6 +67,40 @@ void LoadCNN_SQ8_Library();
 	
 *********************************************************************************************************************************************************************/
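+/* The *_Internal generators return the generated Kernel_T* (0 on failure)
+ * instead of an int status, so the public wrappers can compare candidate
+ * solutions before committing one. */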
 
+Kernel_T *CNN_ConvolutionPoolAct_SQ8_Internal(
+	char         *Name,
+
+	CNN_GenControl_T *Ctrl,
+
+	int Bias_DataSize,
+	int Scale_DataSize,
+
+       	int InFeat,
+       	int OutFeat,
+       	int Width,
+       	int Height,
+
+	KernelOper_T ConvOper,
+       	int Fcx,
+       	int Fcy,
+	int Dcx,
+	int Dcy,
+	int Scx,
+	int Scy,
+	int ConvPad,
+
+	KernelOper_T PoolOper,
+	int Fpx,
+	int Fpy,
+	int Dpx,
+	int Dpy,
+	int Spx,
+	int Spy,
+	int PoolPad,
+
+	KernelOper_T ActOper
+	);
+
 int CNN_ConvolutionPoolAct_SQ8(
 	char         *Name,
 
@@ -302,6 +336,27 @@ int CNN_PoolAct_SQ8(
 	KernelOper_T ActOper
 	);
 
+Kernel_T *CNN_PoolAct_SQ8_Internal(
+	char *Name,
+
+	CNN_GenControl_T *Ctrl,
+
+       	int Feat,
+       	int Width,
+       	int Height,
+
+	KernelOper_T PoolOper,
+	int Fpx,
+	int Fpy,
+	int Dpx,
+	int Dpy,
+	int Spx,
+	int Spy,
+	int PoolPad,
+
+	KernelOper_T ActOper
+	);
+
 /*********************************************************************************************************************************************************************
  	Generator for Activation with tensor centric scaling
 
@@ -335,6 +390,18 @@ int CNN_Act_SQ8(
        	KernelOper_T ActOper
        	);
 
+Kernel_T *CNN_Act_SQ8_Internal(
+	char *Name,
+
+	CNN_GenControl_T *Ctrl,
+
+       	int Feat,
+       	int Width,
+       	int Height,
+
+       	KernelOper_T ActOper
+       	);
+
 
 /*********************************************************************************************************************************************************************
  	Generator for Global Pooling (Max or Average) with tensor centric scaling and optional activation
@@ -411,6 +478,21 @@ int CNN_LinearAct_SQ8(
 	KernelOper_T ActOper
 	);
 
+Kernel_T *CNN_LinearAct_SQ8_Internal(
+	char *Name,
+
+	CNN_GenControl_T *Ctrl,
+
+	int Bias_DataSize,
+	int Scale_DataSize,
+
+	int InDim,
+	int OutDim,
+
+	KernelOper_T LinearOper,
+	KernelOper_T ActOper
+	);
+
 /*********************************************************************************************************************************************************************
  	Generator for SoftMax layers, no scaling
 
@@ -616,6 +698,30 @@ int CNN_MatMulAct_SQ8(
         KernelOper_T ActOper
 	);
 
+Kernel_T *CNN_MatMulAct_SQ8_Internal(
+	char *Name,
+
+	CNN_GenControl_T *Ctrl,
+
+	int Bias_DataSize,
+	int Scale_DataSize,
+
+	int ColM1,
+	int LineM1,
+	int ColM2,
+	int LineM2,
+
+	int Width,
+	int Height,
+	int Scx,
+	int Scy,
+
+        KernelOper_T MatMulOper,
+        KernelOper_T ActOper,
+        int InvertInputs
+	);
+
+
 /*********************************************************************************************************************************************************************
  	Generator for Matrix Multiplication layers with channel centric scaling followed by an optional Activation.
 	Special form to handle small form factor In1 (InFeat x OutFeat)
@@ -676,4 +782,26 @@ int CNN_MatMulSmallM1Act_SQ8(
         KernelOper_T ActOper
 	);
 
+Kernel_T  *CNN_MatMulSmallM1Act_SQ8_Internal(
+	char *Name,
+
+	CNN_GenControl_T *Ctrl,
+
+	int Bias_DataSize,
+	int Scale_DataSize,
+
+	int ColM1,
+	int LineM1,
+	int ColM2,
+	int LineM2,
+
+	int Width,
+	int Height,
+	int Scx,
+	int Scy,
+
+        KernelOper_T MatMulOper,
+        KernelOper_T ActOper
+	);
+
 #endif
diff --git a/tools/autotiler_v3/CNN_Generators_fp16/CNN_Generators_fp16.c b/tools/autotiler_v3/CNN_Generators_fp16/CNN_Generators_fp16.c
index 40ca2284c..659e2e34e 100644
--- a/tools/autotiler_v3/CNN_Generators_fp16/CNN_Generators_fp16.c
+++ b/tools/autotiler_v3/CNN_Generators_fp16/CNN_Generators_fp16.c
@@ -595,7 +595,7 @@ void LoadCNNLibrary_fp16()
 
 *********************************************************************************************************************************************************************/
 
-int CNN_MM_ConvolutionPoolAct_fp16(
+Kernel_T *CNN_MM_ConvolutionPoolAct_fp16_Internal(
 	char         *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -629,20 +629,21 @@ int CNN_MM_ConvolutionPoolAct_fp16(
 {
 	if (ConvOper==KOP_NONE) {
 		if (PoolOper!=KOP_NONE)
-			return CNN_PoolAct_fp16(Name, Ctrl, InFeat, OutFeat, Width, Height, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+			return CNN_PoolAct_fp16_Internal(Name, Ctrl, InFeat, OutFeat, Width, Height, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
 		else if (ActOper!=KOP_NONE)
-			return CNN_Act_fp16(Name, Ctrl, InFeat, Width, Height, ActOper);
+			return CNN_Act_fp16_Internal(Name, Ctrl, InFeat, Width, Height, ActOper);
 		else GenTilingError("CNN_MM_ConvolutionPoolAct_fp16 Kernel: %s, All requested operations are KOP_NONE", Name);
 	}
 
 	int ParFeat = 1, HWC = 0, ParFeatConv = 2;
         float UB = (ActOper==KOP_HSIGMOID)?3.0:6.0; // In Case of HSIGMOID, UB is the Offset (default: 3.0)
-	Tile_Orientation_T TileOrientation = Height>1?TILE_HOR:TILE_VER;
+	Tile_Orientation_T TileOrientation = TILE_HOR;
 	AT_PadType PadType = PAD_BALANCED_LEFT;
 	if (PoolOper==KOP_NONE) {
 		Fpx=1; Dpx=1; Spx=1; Fpy=1; Dpy=1; Spy=1;
 	}
 	if (Ctrl) {
+		if (Ctrl->TileOrientation != -1) TileOrientation = (Ctrl->TileOrientation==0)?TILE_HOR:TILE_VER;
 		if (Ctrl->PadType != -1) PadType = Ctrl->PadType;
 		if (Ctrl->HWC) HWC = 1;
 		if (Ctrl->ParallelFeatures != -1) ParFeatConv = Ctrl->ParallelFeatures;
@@ -742,7 +743,7 @@ int CNN_MM_ConvolutionPoolAct_fp16(
 	}
 
 	if (Log) {
-		printf("InFeat: %d, OutFeat: %d%s%s\n", InFeat, OutFeat, HWC?", HWC":", CHW", ParFeatConv?", Out Chan Parallel":", H Parallel");
+		printf("InFeat: %d, OutFeat: %d%s - TileOrientation: %s\n", InFeat, OutFeat, HWC?", HWC":", CHW", TileOrientation==TILE_HOR?"TILE_HOR":"TILE_VER");
         	printf("Conv => W:  %d, Pad:[%d,%d] PadT:[%d,%d] => Wc: %d, Filter:[%d,%d]\n", Width,  PadInc[0], PadInc[1], PadIncT[0], PadIncT[1], Wc, Fcx, Fcy);
         	printf("     => H:  %d, Pad:[%d,%d] PadT:[%d,%d] => Hc: %d\n", Height, PadInc[2], PadInc[3], PadIncT[2], PadIncT[3], Hc);
         	printf("Pool => Wc: %d, Pad:[%d,%d] => Wo: %d, Filter:[%d,%d]\n", Wc, PadInp[0], PadInp[1], Wo, Fpx, Fpy);
@@ -881,11 +882,11 @@ int CNN_MM_ConvolutionPoolAct_fp16(
                                   Fcx, Fcy, Scx, Scy, Dcx, Dcy, PadInc, Fpx, Fpy, Spx, Spy, Dpx, Dpy, PadInp, KernelOper,
                                   0, 0);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 
-int CNN_HWC_DWConvolutionPoolAct_fp16(
+Kernel_T *CNN_HWC_DWConvolutionPoolAct_fp16_Internal(
 	char         *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -919,9 +920,9 @@ int CNN_HWC_DWConvolutionPoolAct_fp16(
 {
 	if (ConvOper==KOP_NONE) {
 		if (PoolOper!=KOP_NONE)
-			return CNN_PoolAct_fp16(Name, Ctrl, InFeat, OutFeat, Width, Height, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+			return CNN_PoolAct_fp16_Internal(Name, Ctrl, InFeat, OutFeat, Width, Height, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
 		else if (ActOper!=KOP_NONE)
-			return CNN_Act_fp16(Name, Ctrl, InFeat, Width, Height, ActOper);
+			return CNN_Act_fp16_Internal(Name, Ctrl, InFeat, Width, Height, ActOper);
 		else GenTilingError("CNN_HWC_DWConvolutionPoolAct_fp16: %s, All requested operations are KOP_NONE", Name);
 	}
 
@@ -933,6 +934,7 @@ int CNN_HWC_DWConvolutionPoolAct_fp16(
 		Fpx=1; Dpx=1; Spx=1; Fpy=1; Dpy=1; Spy=1;
 	}
 	if (Ctrl) {
+		if (Ctrl->TileOrientation != -1) TileOrientation = (Ctrl->TileOrientation==0)?TILE_HOR:TILE_VER;
 		if (Ctrl->PadType != -1) PadType = Ctrl->PadType;
                 if (Ctrl->ReluN != -1) UB = Ctrl->ReluN;
 	}
@@ -1024,7 +1026,7 @@ int CNN_HWC_DWConvolutionPoolAct_fp16(
 	}
 
 	if (Log) {
-		printf("InFeat: %d, OutFeat: %d%s\n", InFeat, OutFeat, HWC?", HWC":", CHW");
+		printf("InFeat: %d, OutFeat: %d%s - TileOrientation: %s\n", InFeat, OutFeat, HWC?", HWC":", CHW", TileOrientation==TILE_HOR?"TILE_HOR":"TILE_VER");
         	printf("Conv => W:  %d, Pad:[%d,%d] PadT:[%d,%d] => Wc: %d, Filter:[%d,%d]\n", Width,  PadInc[0], PadInc[1], PadIncT[0], PadIncT[1], Wc, Fcx, Fcy);
         	printf("     => H:  %d, Pad:[%d,%d] PadT:[%d,%d] => Hc: %d\n", Height, PadInc[2], PadInc[3], PadIncT[2], PadIncT[3], Hc);
         	printf("Pool => Wc: %d, Pad:[%d,%d] => Wo: %d, Filter:[%d,%d]\n", Wc, PadInp[0], PadInp[1], Wo, Fpx, Fpy);
@@ -1046,7 +1048,7 @@ int CNN_HWC_DWConvolutionPoolAct_fp16(
 	UserSymbols(1, US_Float("UB", UB));
         Kernel_T *Kernel = UserKernel(Name,
 		KernelIterSpace(2, IterTiledSpace(T0), IterParSpace(D0, InFeat, 8)),
-                TILE_HOR,
+                TileOrientation,
                 CArgs(4,
                       TCArg(CNN_ArgDataTypeF(2,1,1), "In"),
                       TCArg(CNN_ArgDataTypeF(2,1,1), "Filter"),
@@ -1154,7 +1156,7 @@ int CNN_HWC_DWConvolutionPoolAct_fp16(
                                   Fcx, Fcy, Scx, Scy, Dcx, Dcy, PadInc, Fpx, Fpy, Spx, Spy, Dpx, Dpy, PadInp, KernelOper,
                                   0, 0);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 /*********************************************************************************************************************************************************************
@@ -1196,36 +1198,36 @@ int CNN_HWC_DWConvolutionPoolAct_fp16(
 	
 *********************************************************************************************************************************************************************/
 
-int CNN_ConvolutionPoolAct_fp16(
-			char         *Name,
+Kernel_T *CNN_ConvolutionPoolAct_fp16_Internal(
+	char         *Name,
 
-			CNN_GenControl_T *Ctrl,
+	CNN_GenControl_T *Ctrl,
 
-                        int InFeat,
-                        int OutFeat,
-                        int Width,
-                        int Height,
+        int InFeat,
+        int OutFeat,
+        int Width,
+        int Height,
 
-			KernelOper_T ConvOper,
-                        int Fcx,
-                        int Fcy,
-			int Dcx,
-			int Dcy,
-			int Scx,
-			int Scy,
-			int ConvPad,
+	KernelOper_T ConvOper,
+        int Fcx,
+        int Fcy,
+	int Dcx,
+	int Dcy,
+	int Scx,
+	int Scy,
+	int ConvPad,
 
-			KernelOper_T PoolOper,
-			int Fpx,
-			int Fpy,
-			int Dpx,
-			int Dpy,
-			int Spx,
-			int Spy,
-			int PoolPad,
+	KernelOper_T PoolOper,
+	int Fpx,
+	int Fpy,
+	int Dpx,
+	int Dpy,
+	int Spx,
+	int Spy,
+	int PoolPad,
 
-			KernelOper_T ActOper
-			)
+	KernelOper_T ActOper
+	)
 
 {
         int Log=1;
@@ -1255,29 +1257,32 @@ int CNN_ConvolutionPoolAct_fp16(
 
         if (ConvOper==KOP_NONE) {
                 if (PoolOper!=KOP_NONE)
-                        return CNN_PoolAct_fp16(Name, Ctrl, InFeat, OutFeat, Width, Height, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+                        return CNN_PoolAct_fp16_Internal(Name, Ctrl, InFeat, OutFeat, Width, Height, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
                 else if (ActOper!=KOP_NONE)
-                        return CNN_Act_fp16(Name, Ctrl, InFeat, Width, Height, ActOper);
+                        return CNN_Act_fp16_Internal(Name, Ctrl, InFeat, Width, Height, ActOper);
                 else GenTilingError("CNN_ConvolutionPoolAct_fp16 Kernel: %s, All requested operations are KOP_NONE", Name);
         } else if (HWC) {
                 if (ConvOper == KOP_CONV_DW)
-                        return CNN_HWC_DWConvolutionPoolAct_fp16(Name, Ctrl, InFeat, OutFeat, Width, Height,
+                        return CNN_HWC_DWConvolutionPoolAct_fp16_Internal(Name, Ctrl, InFeat, OutFeat, Width, Height,
                                                                  ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad,
                                                                  PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
                 else
-                        return CNN_MM_ConvolutionPoolAct_fp16(Name, Ctrl, InFeat, OutFeat, Width, Height,
+                        return CNN_MM_ConvolutionPoolAct_fp16_Internal(Name, Ctrl, InFeat, OutFeat, Width, Height,
                                                               ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad,
                                                               PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
         } else if (ConvOper==KOP_CONV && ((Fcx > 1 && Fcy == 1))) {
-                AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF);
-                int Ok = CNN_MM_ConvolutionPoolAct_fp16(Name, Ctrl, InFeat, OutFeat, Width, Height,
+                //AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF);
+                Kernel_T *Ok = CNN_MM_ConvolutionPoolAct_fp16_Internal(Name, Ctrl, InFeat, OutFeat, Width, Height,
                                                         ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad,
                                                         PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
-                AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON);
-                if (Ok) return Ok;
+                //AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON);
+                if (Ok!=0) return Ok;
                 if (Log) printf("Mapping this convolution to im2col scheme failed, reverting to standard implementation\n");
         }
-
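+	// A 1x1 convolution on a 1x1 spatial input degenerates to a fully-connected layer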
+	if (Fcx==1 && Fcy==1 && Scx==1 && Scy==1 && Dcx==1 && Dcy==1 && Height==1 && Width==1) {
+		printf("This is a pointwise on 1x1 input --> Mapping to CNN_Linear_NE16\n");
+		return CNN_LinearAct_fp16_Internal(Name, Ctrl, InFeat, OutFeat, KOP_LINEAR, ActOper);
+	}
 
         if (PoolOper==KOP_NONE) {
                 Fpx=1; Dpx=1; Spx=1; Fpy=1; Dpy=1; Spy=1;
@@ -1363,7 +1368,7 @@ int CNN_ConvolutionPoolAct_fp16(
                 if (ActKerName==0) GenTilingError("CNN_ConvolutionPoolAct_fp16 Kernel: %s, Can't find a matching Activation basic kernel", Name);
         }
         if (Log) {
-                printf("InFeat: %d, OutFeat: %d\n", InFeat, OutFeat);
+		printf("InFeat: %d, OutFeat: %d%s - TileOrientation: %s\n", InFeat, OutFeat, HWC?", HWC":", CHW", TileOrientation==TILE_HOR?"TILE_HOR":"TILE_VER");
                 printf("Conv => W:  %d, Pad:[%d,%d] PadT:[%d,%d] => Wc: %d, Filter:[%d,%d]\n", Width,  PadInc[0], PadInc[1], PadIncT[0], PadIncT[1], Wc, Fcx, Fcy);
                 printf("     => H:  %d, Pad:[%d,%d] PadT:[%d,%d] => Hc: %d\n", Height, PadInc[2], PadInc[3], PadIncT[2], PadIncT[3], Hc);
                 printf("Pool => Wc: %d, Pad:[%d,%d] => Wo: %d, Filter:[%d,%d]\n", Wc, PadInp[0], PadInp[1], Wo, Fpx, Fpy);
@@ -1384,12 +1389,12 @@ int CNN_ConvolutionPoolAct_fp16(
                 AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF);
                 if ((InFeat+OutFeat)<100) {
                         if (Log) printf("Mapping this convolution to matrix multiplication with small first operand\n");
-                        int Ok = CNN_MatMulSmallM1Act_fp16(Name, 0, InFeat, OutFeat, Width*Height, InFeat, Width, Height, Scx, Scy, KOP_MATMUL_SM1, ActOper);
+                        Kernel_T *Ok = CNN_MatMulSmallM1Act_fp16_Internal(Name, 0, InFeat, OutFeat, Width*Height, InFeat, Width, Height, Scx, Scy, KOP_MATMUL_SM1, ActOper);
                         if (!Ok&&Log) printf("Mapping this convolution to matrix multiplication with small first operand FAILED, trying with standard mult implementation\n");
                         if (Ok) return Ok;
                 }
                 if (Log) printf("Mapping this convolution to matrix multiplication\n");
-                int Ok = CNN_MatMulAct_fp16(Name, 0, InFeat, OutFeat, Width*Height, InFeat, Width, Height, Scx, Scy, KOP_MATMUL, ActOper);
+                Kernel_T *Ok = CNN_MatMulAct_fp16_Internal(Name, 0, InFeat, OutFeat, Width*Height, InFeat, Width, Height, Scx, Scy, KOP_MATMUL, ActOper, 1);
                 AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON);
                 if (Ok) return Ok;
                 if (Log) printf("Mapping this convolution to matrix multiplication FAILED, reverting to standard implementation\n");
@@ -1519,7 +1524,7 @@ int CNN_ConvolutionPoolAct_fp16(
 				  Fcx, Fcy, Scx, Scy, Dcx, Dcy, PadInc, Fpx, Fpy, Spx, Spy, Dpx, Dpy, PadInp, KernelOper,
 				  0, 0);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 /*********************************************************************************************************************************************************************
@@ -1692,30 +1697,30 @@ int CNN_GroupedConvolutionPoolAct_fp16(
 		
 *********************************************************************************************************************************************************************/
 
-int CNN_PoolAct_fp16(
-			char *Name,
+Kernel_T *CNN_PoolAct_fp16_Internal(
+	char *Name,
 
-			CNN_GenControl_T *Ctrl,
+	CNN_GenControl_T *Ctrl,
 
-                        int InFeat,
-                        int OutFeat,
-                        int Width,
-                        int Height,
+        int InFeat,
+        int OutFeat,
+        int Width,
+        int Height,
 
-			KernelOper_T PoolOper,
-			int Fpx,
-			int Fpy,
-			int Dpx,
-			int Dpy,
-			int Spx,
-			int Spy,
-			int PoolPad,
+	KernelOper_T PoolOper,
+	int Fpx,
+	int Fpy,
+	int Dpx,
+	int Dpy,
+	int Spx,
+	int Spy,
+	int PoolPad,
 
-			KernelOper_T ActOper
-			)
+	KernelOper_T ActOper
+	)
 
 {
-        if (PoolOper==KOP_NONE && ActOper!=KOP_NONE) return CNN_Act_fp16(Name, Ctrl, InFeat, Width, Height, ActOper);
+        if (PoolOper==KOP_NONE && ActOper!=KOP_NONE) return CNN_Act_fp16_Internal(Name, Ctrl, InFeat, Width, Height, ActOper);
 
         Tile_Orientation_T TileOrientation = TILE_HOR;
         int ParFeat = 1, HWC = 0;
@@ -1875,7 +1880,7 @@ int CNN_PoolAct_fp16(
 				  KernelOper,
 				  0, 0);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 
@@ -1901,19 +1906,19 @@ int CNN_PoolAct_fp16(
 		
 *********************************************************************************************************************************************************************/
 
-int CNN_GlobalPoolAct_fp16(
-			char *Name,
+static Kernel_T *CNN_GlobalPoolAct_fp16_Internal(
+	char *Name,
 
-			CNN_GenControl_T *Ctrl,
+	CNN_GenControl_T *Ctrl,
 
-                        int InFeat,
-                        int OutFeat,
-                        int Width,
-                        int Height,
+	int InFeat,
+	int OutFeat,
+	int Width,
+	int Height,
 
-			KernelOper_T PoolOper,
-			KernelOper_T ActOper
-			)
+	KernelOper_T PoolOper,
+	KernelOper_T ActOper
+	)
 
 {
 	Tile_Orientation_T TileOrientation = TILE_HOR;
@@ -2017,7 +2022,7 @@ int CNN_GlobalPoolAct_fp16(
 				  KernelOper,
 				  0, 0);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 /*********************************************************************************************************************************************************************
@@ -2040,7 +2045,7 @@ int CNN_GlobalPoolAct_fp16(
 
 *********************************************************************************************************************************************************************/
 
-int CNN_Act_fp16(
+Kernel_T *CNN_Act_fp16_Internal(
         char *Name,
 
         CNN_GenControl_T *Ctrl,
@@ -2130,7 +2135,7 @@ int CNN_Act_fp16(
 				  ActOper,
 				  0, 0);
         }
-        return (Kernel!=0);
+        return Kernel;
 }
 
 /*********************************************************************************************************************************************************************
@@ -2154,7 +2159,7 @@ int CNN_Act_fp16(
 	
 *********************************************************************************************************************************************************************/
 
-int CNN_LinearAct_fp16(
+Kernel_T *CNN_LinearAct_fp16_Internal(
 	char *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -2360,7 +2365,7 @@ int CNN_LinearAct_fp16(
 				  0,
 				  (2==1)?7:15);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 /*********************************************************************************************************************************************************************
@@ -2384,7 +2389,7 @@ int CNN_LinearAct_fp16(
 	
 *********************************************************************************************************************************************************************/
 
-int CNN_SoftMax_fp16(
+static Kernel_T *CNN_SoftMax_fp16_Internal(
 	char *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -2448,10 +2453,10 @@ int CNN_SoftMax_fp16(
 				  KernelOper,
 				  0, 0);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
-int CNN_SoftMax2D_fp16(
+static Kernel_T *CNN_SoftMax2D_fp16_Internal(
 	char *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -2517,7 +2522,7 @@ int CNN_SoftMax2D_fp16(
 				  KernelOper,
 				  0, 0);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 
@@ -2544,7 +2549,7 @@ int CNN_SoftMax2D_fp16(
 *********************************************************************************************************************************************************************/
 
 
-int CNN_MatAddAct_fp16(
+static Kernel_T *CNN_MatAddAct_fp16_Internal(
 	char *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -2642,7 +2647,7 @@ int CNN_MatAddAct_fp16(
 				  KernelOper,
 				  0, 0);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 /*********************************************************************************************************************************************************************
@@ -2684,7 +2689,7 @@ int CNN_MatAddPaddedAct_fp16(
 
 {
         if (PadBot == 0 && PadTop == 0) return CNN_MatAddAct_fp16(Name, Ctrl, Feat, Feat, Width, Height, AddMatOper, ActOper);
-        if (PadTop + PadBot > Feat) GenTilingError("int CNN_MatAddPaddedAct_SQ8 Kernel: %s, Padding exceeds channel size", Name);
+        if (PadTop + PadBot > Feat) GenTilingError("int CNN_MatAddPaddedAct_fp16 Kernel: %s, Padding exceeds channel size", Name);
         int FeatBody = Feat - PadTop - PadBot;
         int Ok = 1;
 
@@ -2799,7 +2804,7 @@ int CNN_MatAddPaddedAct_fp16(
 	
 *********************************************************************************************************************************************************************/
 
-int CNN_MatMulAct_fp16(
+Kernel_T *CNN_MatMulAct_fp16_Internal(
 	char *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -2815,7 +2820,8 @@ int CNN_MatMulAct_fp16(
 	int Scy,
 
         KernelOper_T MatMulOper,
-        KernelOper_T ActOper
+        KernelOper_T ActOper,
+        int InvertInputs
 )
 
 {
@@ -2891,8 +2897,8 @@ int CNN_MatMulAct_fp16(
 		KernelIterSpace(2, IterTiledSpace(T1), IterTiledSpace(T0)),
                 TILE_HOR,
                 CArgs(4,
-                      TCArg(CNN_ArgDataTypeF(2,1,1),  "In2"),
-                      TCArg(CNN_ArgDataTypeF(2,1,1),  "In1"),
+                      TCArg(CNN_ArgDataTypeF(2,1,1),  InvertInputs?"In2":"In1"),
+                      TCArg(CNN_ArgDataTypeF(2,1,1),  InvertInputs?"In1":"In2"),
                       !NoBias?TCArg(CNN_ArgDataTypeF(2,1,1), "Bias"):AT_NO_C_ARG,
                       TCArg(CNN_ArgDataTypeF(2,1,1),  "Out")
                 ),
@@ -2959,7 +2965,7 @@ int CNN_MatMulAct_fp16(
 			  	KernelOper,
 			  	0, 0);
 	}
-	return (Kernel!=0);
+	return Kernel;
 }
 
 /*********************************************************************************************************************************************************************
@@ -2995,7 +3001,7 @@ int CNN_MatMulAct_fp16(
 	
 *********************************************************************************************************************************************************************/
 
-int CNN_MatMulSmallM1Act_fp16(
+Kernel_T *CNN_MatMulSmallM1Act_fp16_Internal(
 	char *Name,
 
 	CNN_GenControl_T *Ctrl,
@@ -3146,7 +3152,287 @@ int CNN_MatMulSmallM1Act_fp16(
 			  	KernelOper,
 			  	0, 0);
 	}
-	return (Kernel!=0);
+	return Kernel;
+
+}
+
+
+/* ============================================================================================================================================================== */
+/* ============================================================================================================================================================== */
+/* ============================================================================================================================================================== */
+/* ============================================================================================================================================================== */
+/* ============================================================================================================================================================== */
+/* ============================================================================================================================================================== */
+/* ============================================================================================================================================================== */
+
+int CNN_MM_ConvolutionPoolAct_fp16(
+	char         *Name,
+
+	CNN_GenControl_T *Ctrl,
+
+       	int InFeat,
+       	int OutFeat,
+       	int Width,
+       	int Height,
+
+	KernelOper_T ConvOper,
+       	int Fcx,
+       	int Fcy,
+	int Dcx,
+	int Dcy,
+	int Scx,
+	int Scy,
+	int ConvPad,
+
+	KernelOper_T PoolOper,
+	int Fpx,
+	int Fpy,
+	int Dpx,
+	int Dpy,
+	int Spx,
+	int Spy,
+	int PoolPad,
+
+	KernelOper_T ActOper
+	)
+{
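+	/* If the caller pinned a tile orientation, generate with it directly; otherwise generate
+	   with both orientations, compare the tiling overheads and commit the cheaper solution. */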
+	Kernel_T *Ker = 0, *Sol1 = 0, *Sol2 = 0;
+        float K = 0.9;
+        if (Ctrl) {
+		if (Ctrl->TileOrientation != -1) {
+			printf("TileOrientation set by user\n");
+			Ker = CNN_MM_ConvolutionPoolAct_fp16_Internal(Name, Ctrl, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+			if (Ker!=0) return 1;
+			else GenTilingError("CNN_MM_ConvolutionPoolAct_fp16: %s, Failed to gen with set tiling orientation, try to let the Autotiler set it for you", Name);
+		}
+	}
+
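+	// Disable the no-solution abort so a failing orientation yields a null kernel; restored before returning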
+	AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF);
+
+	CNN_GenControl_T InternalCtrl;
+	if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl);
+    	else 	   InternalCtrl = *Ctrl;
+
+	printf("\n\n=============================== Trying Tile Orientation: TILE_HOR ===============================\n\n");
+    	CNN_SetGenCtrl(&InternalCtrl, "TILEORIENTATION", AT_OPT_VAL(0));
+        Ker = CNN_MM_ConvolutionPoolAct_fp16_Internal(Name, &InternalCtrl, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+        if (Ker) Sol1 = CopyAndPopUserKernel(Ker);
+
+	printf("\n=============================== Trying Tile Orientation: TILE_VER ===============================\n\n");
+    	CNN_SetGenCtrl(&InternalCtrl, "TILEORIENTATION", AT_OPT_VAL(1));
+        Ker = CNN_MM_ConvolutionPoolAct_fp16_Internal(Name, &InternalCtrl, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+        if (Ker) Sol2 = CopyAndPopUserKernel(Ker);
+
+        if (Sol1 && Sol2) {
+		int TakeSol1 = ((K*Sol1->Cost->TileOverhead) < Sol2->Cost->TileOverhead);  // K close to 1.0 biases the choice slightly towards TILE_HOR
+		printf(">>>>>>>>>>>>>>>>>> %s is better: %.3f vs %.3f \n\n\n", TakeSol1?"TILE_HOR":"TILE_VER", Sol1->Cost->TileOverhead, Sol2->Cost->TileOverhead);
+		if (TakeSol1) {
+                    PushBackUserKernel(Sol1); ReleaseUserKerne(Sol2);
+                } else {
+                    PushBackUserKernel(Sol2); ReleaseUserKerne(Sol1);
+                }
+        } else if (Sol1) {
+                PushBackUserKernel(Sol1); ReleaseUserKerne(Sol2);
+	} else if (Sol2) {
+                PushBackUserKernel(Sol2); ReleaseUserKerne(Sol1);
+	} else {
+		GenTilingError("Failed to Generate code for Kernel: %s", Name);
+	}
+        AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON);
+        return 1;
+}
+
+int CNN_HWC_DWConvolutionPoolAct_fp16(
+	char         *Name,
+
+	CNN_GenControl_T *Ctrl,
+
+       	int InFeat,
+       	int OutFeat,
+       	int Width,
+       	int Height,
+
+	KernelOper_T ConvOper,
+       	int Fcx,
+       	int Fcy,
+	int Dcx,
+	int Dcy,
+	int Scx,
+	int Scy,
+	int ConvPad,
+
+	KernelOper_T PoolOper,
+	int Fpx,
+	int Fpy,
+	int Dpx,
+	int Dpy,
+	int Spx,
+	int Spy,
+	int PoolPad,
+
+	KernelOper_T ActOper
+	)
+{
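+	// Same two-pass tile-orientation search as in CNN_MM_ConvolutionPoolAct_fp16 above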
+	Kernel_T *Ker = 0, *Sol1 = 0, *Sol2 = 0;
+        float K = 0.9;
+        if (Ctrl) {
+		if (Ctrl->TileOrientation != -1) {
+			printf("TileOrientation set by user\n");
+			Ker = CNN_HWC_DWConvolutionPoolAct_fp16_Internal(Name, Ctrl, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+			if (Ker!=0) return 1;
+			else GenTilingError("CNN_MM_ConvolutionPoolAct_fp16: %s, Failed to gen with set tiling orientation, try to let the Autotiler set it for you", Name);
+		}
+	}
+
+	AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF);
+
+	CNN_GenControl_T InternalCtrl;
+	if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl);
+    	else 	   InternalCtrl = *Ctrl;
+
+	printf("\n\n=============================== Trying Tile Orientation: TILE_HOR ===============================\n\n");
+    	CNN_SetGenCtrl(&InternalCtrl, "TILEORIENTATION", AT_OPT_VAL(0));
+        Ker = CNN_HWC_DWConvolutionPoolAct_fp16_Internal(Name, &InternalCtrl, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+        if (Ker) Sol1 = CopyAndPopUserKernel(Ker);
+
+	printf("\n=============================== Trying Tile Orientation: TILE_VER ===============================\n\n");
+    	CNN_SetGenCtrl(&InternalCtrl, "TILEORIENTATION", AT_OPT_VAL(1));
+        Ker = CNN_HWC_DWConvolutionPoolAct_fp16_Internal(Name, &InternalCtrl, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+        if (Ker) Sol2 = CopyAndPopUserKernel(Ker);
+
+        if (Sol1 && Sol2) {
+		int TakeSol1 = ((K*Sol1->Cost->TileOverhead) < Sol2->Cost->TileOverhead);  // K close to 1.0 biases the choice slightly towards TILE_HOR
+		printf(">>>>>>>>>>>>>>>>>> %s is better: %.3f vs %.3f \n\n\n", TakeSol1?"TILE_HOR":"TILE_VER", Sol1->Cost->TileOverhead, Sol2->Cost->TileOverhead);
+		if (TakeSol1) {
+                    PushBackUserKernel(Sol1); ReleaseUserKerne(Sol2);
+                } else {
+                    PushBackUserKernel(Sol2); ReleaseUserKerne(Sol1);
+                }
+        } else if (Sol1) {
+                PushBackUserKernel(Sol1); ReleaseUserKerne(Sol2);
+	} else if (Sol2) {
+                PushBackUserKernel(Sol2); ReleaseUserKerne(Sol1);
+	} else {
+		GenTilingError("Failed to Generate code for Kernel: %s", Name);
+	}
+        AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON);
+        return 1;
+}
+
+int CNN_ConvolutionPoolAct_fp16(
+	char         *Name,
+
+	CNN_GenControl_T *Ctrl,
+
+       	int InFeat,
+       	int OutFeat,
+       	int Width,
+       	int Height,
+
+	KernelOper_T ConvOper,
+       	int Fcx,
+       	int Fcy,
+	int Dcx,
+	int Dcy,
+	int Scx,
+	int Scy,
+	int ConvPad,
+
+	KernelOper_T PoolOper,
+	int Fpx,
+	int Fpy,
+	int Dpx,
+	int Dpy,
+	int Spx,
+	int Spy,
+	int PoolPad,
+
+	KernelOper_T ActOper
+	)
+{
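+	// Same two-pass tile-orientation search as in CNN_MM_ConvolutionPoolAct_fp16 above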
+	Kernel_T *Ker = 0, *Sol1 = 0, *Sol2 = 0;
+        float K = 0.9;
+        if (Ctrl) {
+		if (Ctrl->TileOrientation != -1) {
+			printf("TileOrientation set by user\n");
+			Ker = CNN_ConvolutionPoolAct_fp16_Internal(Name, Ctrl, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+			if (Ker!=0) return 1;
+			else GenTilingError("CNN_ConvolutionPoolAct_fp16: %s, Failed to gen with set tiling orientation, try to let the Autotiler set it for you", Name);
+		}
+	}
+	AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF);
+
+	CNN_GenControl_T InternalCtrl;
+	if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl);
+    	else 	   InternalCtrl = *Ctrl;
+
+	printf("\n\n=============================== Trying Tile Orientation: TILE_HOR ===============================\n\n");
+    	CNN_SetGenCtrl(&InternalCtrl, "TILEORIENTATION", AT_OPT_VAL(0));
+        Ker = CNN_ConvolutionPoolAct_fp16_Internal(Name, &InternalCtrl, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+        if (Ker) Sol1 = CopyAndPopUserKernel(Ker);
+
+	printf("\n=============================== Trying Tile Orientation: TILE_VER ===============================\n\n");
+    	CNN_SetGenCtrl(&InternalCtrl, "TILEORIENTATION", AT_OPT_VAL(1));
+        Ker = CNN_ConvolutionPoolAct_fp16_Internal(Name, &InternalCtrl, InFeat, OutFeat, Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper);
+        if (Ker) Sol2 = CopyAndPopUserKernel(Ker);
+
+        if (Sol1 && Sol2) {
+		int TakeSol1 = ((K*Sol1->Cost->TileOverhead) < Sol2->Cost->TileOverhead);  // K close to 1.0 biases the choice slightly towards TILE_HOR
+		printf(">>>>>>>>>>>>>>>>>> %s is better: %.3f vs %.3f \n\n\n", TakeSol1?"TILE_HOR":"TILE_VER", Sol1->Cost->TileOverhead, Sol2->Cost->TileOverhead);
+		if (TakeSol1) {
+                    PushBackUserKernel(Sol1); ReleaseUserKerne(Sol2);
+                } else {
+                    PushBackUserKernel(Sol2); ReleaseUserKerne(Sol1);
+                }
+        } else if (Sol1) {
+                PushBackUserKernel(Sol1); ReleaseUserKerne(Sol2);
+	} else if (Sol2) {
+                PushBackUserKernel(Sol2); ReleaseUserKerne(Sol1);
+	} else {
+		GenTilingError("Failed to Generate code for Kernel: %s", Name);
+	}
+        AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON);
+        return 1;
+}
+
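+/* Thin wrappers preserving the original int-returning public API on top of the *_Internal generators */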
+int CNN_PoolAct_fp16(char *Name, CNN_GenControl_T *Ctrl, int InFeat, int OutFeat, int Width, int Height, KernelOper_T PoolOper, int Fpx, int Fpy, int Dpx, int Dpy, int Spx, int Spy, int PoolPad, KernelOper_T ActOper)
+{
+	return (CNN_PoolAct_fp16_Internal(Name, Ctrl, InFeat, OutFeat, Width, Height, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper)!=0);
+}
+
+int CNN_Act_fp16(char *Name, CNN_GenControl_T *Ctrl, int Feat, int Width, int Height, KernelOper_T ActOper) {
+	return (CNN_Act_fp16_Internal(Name, Ctrl, Feat, Width, Height, ActOper)!=0);
+}
+
+int CNN_GlobalPoolAct_fp16(char *Name, CNN_GenControl_T *Ctrl, int InFeat, int OutFeat, int Width, int Height, KernelOper_T PoolOper, KernelOper_T ActOper) {
+	return (CNN_GlobalPoolAct_fp16_Internal(Name, Ctrl, InFeat, OutFeat, Width, Height, PoolOper, ActOper)!=0);
+}
+
+int CNN_LinearAct_fp16(char *Name, CNN_GenControl_T *Ctrl, int InDim, int OutDim, KernelOper_T LinearOper, KernelOper_T ActOper)
+{
+	return (CNN_LinearAct_fp16_Internal(Name, Ctrl, InDim, OutDim, LinearOper, ActOper)!=0);
+}
+
+int CNN_SoftMax_fp16(char *Name, CNN_GenControl_T *Ctrl, int Dim, KernelOper_T SoftMaxOper) {
+	return (CNN_SoftMax_fp16_Internal(Name, Ctrl, Dim, SoftMaxOper)!=0);
+}
+
+int CNN_SoftMax2D_fp16(char *Name, CNN_GenControl_T *Ctrl, int Dim, int N, KernelOper_T SoftMaxOper) {
+	return (CNN_SoftMax2D_fp16_Internal(Name, Ctrl, Dim, N, SoftMaxOper)!=0);
+}
+
+int CNN_MatAddAct_fp16(char *Name, CNN_GenControl_T *Ctrl, int InFeat, int OutFeat, int Width, int Height, KernelOper_T AddMatOper, KernelOper_T ActOper) {
+	return (CNN_MatAddAct_fp16_Internal(Name, Ctrl, InFeat, OutFeat, Width, Height, AddMatOper, ActOper)!=0);
+}
+
+int CNN_MatMulAct_fp16(char *Name, CNN_GenControl_T *Ctrl, int ColM1, int LineM1, int ColM2, int LineM2, int Width, int Height, int Scx, int Scy, KernelOper_T MatMulOper, KernelOper_T ActOper) {
+	return (CNN_MatMulAct_fp16_Internal(Name, Ctrl, ColM1, LineM1, ColM2, LineM2, Width, Height, Scx, Scy, MatMulOper, ActOper, 1)!=0);
+}
 
+int CNN_MatMulSmallM1Act_fp16(char *Name, CNN_GenControl_T *Ctrl, int ColM1, int LineM1, int ColM2, int LineM2, int Width, int Height, int Scx, int Scy, KernelOper_T MatMulOper, KernelOper_T ActOper) {
+	return (CNN_MatMulSmallM1Act_fp16_Internal(Name, Ctrl, ColM1, LineM1, ColM2, LineM2, Width, Height, Scx, Scy, MatMulOper, ActOper)!=0);
 }
 
diff --git a/tools/autotiler_v3/CNN_Generators_fp16/CNN_Generators_fp16.h b/tools/autotiler_v3/CNN_Generators_fp16/CNN_Generators_fp16.h
index 5e942f2b6..7096cbc26 100644
--- a/tools/autotiler_v3/CNN_Generators_fp16/CNN_Generators_fp16.h
+++ b/tools/autotiler_v3/CNN_Generators_fp16/CNN_Generators_fp16.h
@@ -67,6 +67,99 @@ extern void LoadCNNLibrary_fp16();
 
  */
 
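+/* _Internal generator variants: unlike the int-returning public entry points, they return the
+   generated Kernel_T (null on failure) so callers can compare candidate solutions before
+   committing one. */
+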
+Kernel_T *CNN_MM_ConvolutionPoolAct_fp16_Internal(
+        char         *Name,
+
+        CNN_GenControl_T *Ctrl,
+
+        int InFeat,
+        int OutFeat,
+        int Width,
+        int Height,
+
+        KernelOper_T ConvOper,
+        int Fcx,
+        int Fcy,
+        int Dcx,
+        int Dcy,
+        int Scx,
+        int Scy,
+        int ConvPad,
+
+        KernelOper_T PoolOper,
+        int Fpx,
+        int Fpy,
+        int Dpx,
+        int Dpy,
+        int Spx,
+        int Spy,
+        int PoolPad,
+
+        KernelOper_T ActOper
+        );
+
+Kernel_T *CNN_HWC_DWConvolutionPoolAct_fp16_Internal(
+        char         *Name,
+
+        CNN_GenControl_T *Ctrl,
+
+        int InFeat,
+        int OutFeat,
+        int Width,
+        int Height,
+
+        KernelOper_T ConvOper,
+        int Fcx,
+        int Fcy,
+        int Dcx,
+        int Dcy,
+        int Scx,
+        int Scy,
+        int ConvPad,
+
+        KernelOper_T PoolOper,
+        int Fpx,
+        int Fpy,
+        int Dpx,
+        int Dpy,
+        int Spx,
+        int Spy,
+        int PoolPad,
+
+        KernelOper_T ActOper
+        );
+
+Kernel_T *CNN_ConvolutionPoolAct_fp16_Internal(
+        char         *Name,
+
+        CNN_GenControl_T *Ctrl,
+
+        int InFeat,
+        int OutFeat,
+        int Width,
+        int Height,
+
+        KernelOper_T ConvOper,
+        int Fcx,
+        int Fcy,
+        int Dcx,
+        int Dcy,
+        int Scx,
+        int Scy,
+        int ConvPad,
+
+        KernelOper_T PoolOper,
+        int Fpx,
+        int Fpy,
+        int Dpx,
+        int Dpy,
+        int Spx,
+        int Spy,
+        int PoolPad,
+
+        KernelOper_T ActOper
+        );
+
 extern int CNN_ConvolutionPoolAct_fp16(
 	char         *Name,
 
@@ -84,7 +177,7 @@ extern int CNN_ConvolutionPoolAct_fp16(
 	int Dcy,
 	int Scx,
 	int Scy,
-	int          ConvPad,
+	int ConvPad,
 
 	KernelOper_T PoolOper,
 	int Fpx,
@@ -93,7 +186,7 @@ extern int CNN_ConvolutionPoolAct_fp16(
 	int Dpy,
 	int Spx,
 	int Spy,
-	int          PoolPad,
+	int PoolPad,
 
        	KernelOper_T ActOper
 	);
@@ -200,6 +293,28 @@ extern int CNN_GroupedConvolutionPoolAct_fp16(
 
  */
 
+Kernel_T *CNN_PoolAct_fp16_Internal(
+        char *Name,
+
+        CNN_GenControl_T *Ctrl,
+
+        int InFeat,
+        int OutFeat,
+        int Width,
+        int Height,
+
+        KernelOper_T PoolOper,
+        int Fpx,
+        int Fpy,
+        int Dpx,
+        int Dpy,
+        int Spx,
+        int Spy,
+        int PoolPad,
+
+        KernelOper_T ActOper
+        );
+
 extern int CNN_PoolAct_fp16(
 	char         *Name,
 
@@ -276,6 +391,18 @@ extern int CNN_GlobalPoolAct_fp16(
 
 *********************************************************************************************************************************************************************/
 
+Kernel_T *CNN_Act_fp16_Internal(
+        char *Name,
+
+        CNN_GenControl_T *Ctrl,
+
+        int Feat,
+        int Width,
+        int Height,
+
+        KernelOper_T ActOper
+        );
+
 extern int CNN_Act_fp16(
         char *Name,
 
@@ -309,6 +436,18 @@ extern int CNN_Act_fp16(
 
 */
 
+Kernel_T *CNN_LinearAct_fp16_Internal(
+        char *Name,
+
+        CNN_GenControl_T *Ctrl,
+
+        int InDim,
+        int OutDim,
+
+        KernelOper_T LinearOper,
+        KernelOper_T ActOper
+        );
+
 extern int CNN_LinearAct_fp16(
         char *Name,
 
@@ -456,7 +595,27 @@ extern int CNN_MatAddPaddedAct_fp16(
 
     \param    Signature:      Name(In2, In1, Bias, Out)
 */
-        
+
+Kernel_T *CNN_MatMulAct_fp16_Internal(
+        char *Name,
+
+        CNN_GenControl_T *Ctrl,
+
+        int ColM1,
+        int LineM1,
+        int ColM2,
+        int LineM2,
+
+        int Width,
+        int Height,
+        int Scx,
+        int Scy,
+
+        KernelOper_T MatMulOper,
+        KernelOper_T ActOper,
+        int InvertInputs
+        );
+
 extern int CNN_MatMulAct_fp16(
         char *Name,
 
@@ -509,6 +668,25 @@ extern int CNN_MatMulAct_fp16(
 
 *********************************************************************************************************************************************************************/
 
+Kernel_T *CNN_MatMulSmallM1Act_fp16_Internal(
+        char *Name,
+
+        CNN_GenControl_T *Ctrl,
+
+        int ColM1,
+        int LineM1,
+        int ColM2,
+        int LineM2,
+
+        int Width,
+        int Height,
+        int Scx,
+        int Scy,
+
+        KernelOper_T MatMulOper,
+        KernelOper_T ActOper
+        );
+
 extern int CNN_MatMulSmallM1Act_fp16(
         char *Name,
 
diff --git a/tools/autotiler_v3/CNN_Libraries/CNN_Copy.c b/tools/autotiler_v3/CNN_Libraries/CNN_Copy.c
index 3ccdeccf2..8b7b62ecd 100644
--- a/tools/autotiler_v3/CNN_Libraries/CNN_Copy.c
+++ b/tools/autotiler_v3/CNN_Libraries/CNN_Copy.c
@@ -39,10 +39,15 @@ float to_float(unsigned short v) {
 #endif
 #else
 #ifndef TO_FLOAT
-#define TO_FLOAT(x) *((F16 *)&x)
+#define TO_FLOAT(x) *((F16 *) (&x))
 #endif
 #endif
 
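+// Reinterprets 32-bit integer storage as a float through a union, avoiding the
+// strict-aliasing hazard of a raw pointer cast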
+typedef union {
+	float F;
+	int I;
+} f32_cast_t;
+
 static inline unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X)
 
 {
@@ -1706,8 +1711,13 @@ void CNN_FpFloat16(CNN_FpFloat16_T * Arg)
 	unsigned int Iter = Max(0, Last-First);
 	signed short *pIn = (signed short *) (Arg->In + First);
 	F16V *pOut = (F16V *) (Arg->Out + First);
-	F16 zero_diff = TO_FLOAT(Arg->Infos[AT_INF_QUANT_ZERO_DIFF]);
-	F16 scale = TO_FLOAT(Arg->Infos[AT_INF_QUANT_SCALE]);
+
+	f32_cast_t caster;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_ZERO_DIFF));
+	F16 zero_diff = (F16) caster.F;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_SCALE));
+	F16 scale = (F16) caster.F;
+
 	F16V scale_v = gap_pack2f16(scale, scale);
 	F16V zero_v = gap_pack2f16(zero_diff, zero_diff);
 
@@ -1731,8 +1741,13 @@ void CNN_UFpFloat16(CNN_UFpFloat16_T * Arg)
 	unsigned int Iter = Max(0, Last-First);
 	unsigned short *pIn = (unsigned short *) (Arg->In + First);
 	F16V *pOut = (F16V *) (Arg->Out + First);
-	F16 zero_diff = TO_FLOAT(Arg->Infos[AT_INF_QUANT_ZERO_DIFF]);
-	F16 scale = TO_FLOAT(Arg->Infos[AT_INF_QUANT_SCALE]);
+
+	f32_cast_t caster;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_ZERO_DIFF));
+	F16 zero_diff = (F16) caster.F;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_SCALE));
+	F16 scale = (F16) caster.F;
+
 	F16V scale_v = gap_pack2f16(scale, scale);
 	F16V zero_v = gap_pack2f16(zero_diff, zero_diff);
 
@@ -1756,8 +1771,13 @@ void CNN_FpsFloat16(CNN_FpsFloat16_T * Arg)
 	unsigned int Iter = Max(0, Last-First);
 	signed char *pIn = (signed char *) (Arg->In + First);
 	F16V *pOut = (F16V *) (Arg->Out + First);
-	F16 zero_diff = TO_FLOAT(Arg->Infos[AT_INF_QUANT_ZERO_DIFF]);
-	F16 scale = TO_FLOAT(Arg->Infos[AT_INF_QUANT_SCALE]);
+
+	f32_cast_t caster;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_ZERO_DIFF));
+	F16 zero_diff = (F16) caster.F;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_SCALE));
+	F16 scale = (F16) caster.F;
+
 	F16V scale_v = gap_pack2f16(scale, scale);
 	F16V zero_v = gap_pack2f16(zero_diff, zero_diff);
 
@@ -1781,8 +1801,13 @@ void CNN_UFpsFloat16(CNN_UFpsFloat16_T * Arg)
 	unsigned int Iter = Max(0, Last-First);
 	unsigned char *pIn = (signed char *) (Arg->In + First);
 	F16V *pOut = (F16V *) (Arg->Out + First);
-	F16 zero_diff = TO_FLOAT(Arg->Infos[AT_INF_QUANT_ZERO_DIFF]);
-	F16 scale = TO_FLOAT(Arg->Infos[AT_INF_QUANT_SCALE]);
+
+	f32_cast_t caster;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_ZERO_DIFF));
+	F16 zero_diff = (F16) caster.F;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_SCALE));
+	F16 scale = (F16) caster.F;
+
 	F16V scale_v = gap_pack2f16(scale, scale);
 	F16V zero_v = gap_pack2f16(zero_diff, zero_diff);
 
@@ -1806,8 +1831,13 @@ void CNN_Float16Fp(CNN_Float16Fp_T * Arg)
 	unsigned int Iter = Max(0, Last-First);
 	F16V *pIn = (F16V *) (Arg->In + First);
 	v2s *pOut = (v2s *) (Arg->Out + First);
-	F16 zero_diff = TO_FLOAT(Arg->Infos[AT_INF_QUANT_ZERO_DIFF]);
-	F16 scale = TO_FLOAT(Arg->Infos[AT_INF_QUANT_SCALE]);
+
+	f32_cast_t caster;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_ZERO_DIFF));
+	F16 zero_diff = (F16) caster.F;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_SCALE));
+	F16 scale = (F16) caster.F;
+
 	F16V scale_v = gap_pack2f16(scale, scale);
 	F16V zero_v = gap_pack2f16(zero_diff, zero_diff);
 	F16V max_v = gap_pack2f16(32767.0F, 32767.0F);
@@ -1836,8 +1866,13 @@ void CNN_Float16UFp(CNN_Float16UFp_T * Arg)
 	unsigned int Iter = Max(0, Last-First);
 	F16V *pIn = (F16V *) (Arg->In + First);
 	v2u *pOut = (v2u *) (Arg->Out + First);
-	F16 zero_diff = TO_FLOAT(Arg->Infos[AT_INF_QUANT_ZERO_DIFF]);
-	F16 scale = TO_FLOAT(Arg->Infos[AT_INF_QUANT_SCALE]);
+
+	f32_cast_t caster;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_ZERO_DIFF));
+	F16 zero_diff = (F16) caster.F;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_SCALE));
+	F16 scale = (F16) caster.F;
+
 	F16V scale_v = gap_pack2f16(scale, scale);
 	F16V zero_v = gap_pack2f16(zero_diff, zero_diff);
 	F16V max_v = gap_pack2f16(65535.0F, 65535.0F);
@@ -1866,8 +1901,13 @@ void CNN_Float16Fps(CNN_Float16Fps_T * Arg)
 	unsigned int Iter = Max(0, Last-First);
 	F16V *pIn = (F16V *) (Arg->In + First);
 	v4s *pOut = (v4s *) (Arg->Out + First);
-	F16 zero_diff = TO_FLOAT(Arg->Infos[AT_INF_QUANT_ZERO_DIFF]);
-	F16 scale = TO_FLOAT(Arg->Infos[AT_INF_QUANT_SCALE]);
+
+	f32_cast_t caster;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_ZERO_DIFF));
+	F16 zero_diff = (F16) caster.F;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_SCALE));
+	F16 scale = (F16) caster.F;
+
 	F16V scale_v = gap_pack2f16(scale, scale);
 	F16V zero_v = gap_pack2f16(zero_diff, zero_diff);
 	F16V max_v = gap_pack2f16(127.0F, 127.0F);
@@ -1903,8 +1943,13 @@ void CNN_Float16UFps(CNN_Float16UFps_T * Arg)
 	unsigned int Iter = Max(0, Last-First);
 	F16V *pIn = (F16V *) (Arg->In + First);
 	v4u *pOut = (v4u *) (Arg->Out + First);
-	F16 zero_diff = TO_FLOAT(Arg->Infos[AT_INF_QUANT_ZERO_DIFF]);
-	F16 scale = TO_FLOAT(Arg->Infos[AT_INF_QUANT_SCALE]);
+
+	f32_cast_t caster;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_ZERO_DIFF));
+	F16 zero_diff = (F16) caster.F;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_SCALE));
+	F16 scale = (F16) caster.F;
+
 	F16V scale_v = gap_pack2f16(scale, scale);
 	F16V zero_v = gap_pack2f16(zero_diff, zero_diff);
 	F16V max_v = gap_pack2f16(255.0F, 255.0F);
@@ -1932,4 +1977,171 @@ void CNN_Float16UFps(CNN_Float16UFps_T * Arg)
 	gap_waitbarrier(0);
 }
 
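+/* Fixed-point <-> float32 conversion kernels. Work is split across the cluster cores in
+   chunks [First, Last); conversion computes Out = (In + zero_diff) * scale, with the
+   float -> fixed-point direction saturated through Clipf32. */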
+void CNN_FpFloat32(CNN_FpFloat32_T * Arg)
+{
+	unsigned int Size = Arg->W * Arg->H;
+	unsigned int CoreId = gap_coreid();
+	unsigned int Chunk = ChunkSize(Size), First = Chunk*CoreId, Last = Min(First+Chunk, Size);
+	unsigned int Iter = Max(0, Last-First);
+	signed short *pIn = (signed short *) (Arg->In);
+	float *pOut = (float *) (Arg->Out);
+
+	f32_cast_t caster;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_ZERO_DIFF));
+	float zero_diff = (float) caster.F;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_SCALE));
+	float scale = (float) caster.F;
+
+	for (int i=First; i<Last; i++) {
+		pOut[i] = (pIn[i] + zero_diff) * scale;
+	}
+	gap_waitbarrier(0);
+}
+
+void CNN_UFpFloat32(CNN_UFpFloat32_T * Arg)
+{
+	unsigned int Size = Arg->W * Arg->H;
+	unsigned int CoreId = gap_coreid();
+	unsigned int Chunk = ChunkSize(Size), First = Chunk*CoreId, Last = Min(First+Chunk, Size);
+	unsigned int Iter = Max(0, Last-First);
+	unsigned short *pIn = (unsigned short *) (Arg->In);
+	float *pOut = (float *) (Arg->Out);
+
+	f32_cast_t caster;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_ZERO_DIFF));
+	float zero_diff = (float) caster.F;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_SCALE));
+	float scale = (float) caster.F;
+
+	for (int i=First; i<Last; i++) {
+		pOut[i] = (pIn[i] + zero_diff) * scale;
+	}
+	gap_waitbarrier(0);
+}
+
+void CNN_FpsFloat32(CNN_FpsFloat32_T * Arg)
+{
+	unsigned int Size = Arg->W * Arg->H;
+	unsigned int CoreId = gap_coreid();
+	unsigned int Chunk = ChunkSize(Size), First = Chunk*CoreId, Last = Min(First+Chunk, Size);
+	unsigned int Iter = Max(0, Last-First);
+	signed char *pIn = (signed char *) (Arg->In);
+	float *pOut = (float *) (Arg->Out);
+
+	f32_cast_t caster;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_ZERO_DIFF));
+	float zero_diff = (float) caster.F;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_SCALE));
+	float scale = (float) caster.F;
+
+	for (int i=First; i<Last; i++) {
+		pOut[i] = (pIn[i] + zero_diff) * scale;
+	}
+	gap_waitbarrier(0);
+}
+
+void CNN_UFpsFloat32(CNN_UFpsFloat32_T * Arg)
+{
+	unsigned int Size = Arg->W * Arg->H;
+	unsigned int CoreId = gap_coreid();
+	unsigned int Chunk = ChunkSize(Size), First = Chunk*CoreId, Last = Min(First+Chunk, Size);
+	unsigned int Iter = Max(0, Last-First);
+	unsigned char *pIn = (unsigned char *) (Arg->In);
+	float *pOut = (float *) (Arg->Out);
+
+	f32_cast_t caster;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_ZERO_DIFF));
+	float zero_diff = (float) caster.F;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_SCALE));
+	float scale = (float) caster.F;
+
+	for (int i=First; i<Last; i++) {
+		pOut[i] = (pIn[i] + zero_diff) * scale;
+	}
+	gap_waitbarrier(0);
+}
+
+void CNN_Float32Fp(CNN_Float32Fp_T * Arg)
+{
+	unsigned int Size = Arg->W * Arg->H;
+	unsigned int CoreId = gap_coreid();
+	unsigned int Chunk = ChunkSize(Size), First = Chunk*CoreId, Last = Min(First+Chunk, Size);
+	unsigned int Iter = Max(0, Last-First);
+	float *pIn = (float *) (Arg->In);
+	signed short int *pOut = (short int *) (Arg->Out);
+
+	f32_cast_t caster;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_ZERO_DIFF));
+	float zero_diff = (float) caster.F;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_SCALE));
+	float scale = (float) caster.F;
+
+	for (int i=First; i<Last; i++) {
+		pOut[i] = (signed short int) Clipf32((pIn[i] + zero_diff) * scale, 32767.0F, -32768.0F);
+	}
+	gap_waitbarrier(0);
+}
+
+void CNN_Float32UFp(CNN_Float32UFp_T * Arg)
+{
+	unsigned int Size = Arg->W * Arg->H;
+	unsigned int CoreId = gap_coreid();
+	unsigned int Chunk = ChunkSize(Size), First = Chunk*CoreId, Last = Min(First+Chunk, Size);
+	unsigned int Iter = Max(0, Last-First);
+	float *pIn = (float *) (Arg->In);
+	unsigned short int *pOut = (unsigned short int *) (Arg->Out);
+
+	f32_cast_t caster;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_ZERO_DIFF));
+	float zero_diff = (float) caster.F;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_SCALE));
+	float scale = (float) caster.F;
+
+	for (int i=First; i<Last; i++) {
+		pOut[i] = (unsigned short int) Clipf32((pIn[i] + zero_diff) * scale, 65535.0F, 0.0F);
+	}
+	gap_waitbarrier(0);
+}
+
+void CNN_Float32Fps(CNN_Float32Fps_T * Arg)
+{
+	unsigned int Size = Arg->W * Arg->H;
+	unsigned int CoreId = gap_coreid();
+	unsigned int Chunk = ChunkSize(Size), First = Chunk*CoreId, Last = Min(First+Chunk, Size);
+	unsigned int Iter = Max(0, Last-First);
+	float *pIn = (float *) (Arg->In);
+	signed char *pOut = (signed char *) (Arg->Out);
+
+	f32_cast_t caster;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_ZERO_DIFF));
+	float zero_diff = (float) caster.F;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_SCALE));
+	float scale = (float) caster.F;
+
+	for (int i=First; i<Last; i++) {
+		pOut[i] = (signed char) Clipf32((pIn[i] + zero_diff) * scale, 127.0F, -128.0F);
+	}
+	gap_waitbarrier(0);
+}
+
+void CNN_Float32UFps(CNN_Float32UFps_T * Arg)
+{
+	unsigned int Size = Arg->W * Arg->H;
+	unsigned int CoreId = gap_coreid();
+	unsigned int Chunk = ChunkSize(Size), First = Chunk*CoreId, Last = Min(First+Chunk, Size);
+	unsigned int Iter = Max(0, Last-First);
+	float *pIn = (float *) (Arg->In);
+	unsigned char *pOut = (unsigned char *) (Arg->Out);
+
+	f32_cast_t caster;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_ZERO_DIFF));
+	float zero_diff = (float) caster.F;
+	caster.I = *((int *) (Arg->Infos + AT_INF_QUANT_SCALE));
+	float scale = (float) caster.F;
+
+	for (int i=First; i<Last; i++) {
+		pOut[i] = (unsigned char) Clipf32((pIn[i] + zero_diff) * scale, 255.0F, 0.0F);
+	}
+	gap_waitbarrier(0);
+}
+
 #pragma GCC diagnostic pop
diff --git a/tools/autotiler_v3/CNN_Libraries/CNN_Copy.h b/tools/autotiler_v3/CNN_Libraries/CNN_Copy.h
index 7ce10a20a..a53a8e0ce 100644
--- a/tools/autotiler_v3/CNN_Libraries/CNN_Copy.h
+++ b/tools/autotiler_v3/CNN_Libraries/CNN_Copy.h
@@ -310,6 +310,78 @@ typedef struct {
     signed char *__restrict__ Infos;
 } CNN_UFpsFloat16_T;
 
+// float32 -> signed short
+typedef struct {
+	float *__restrict__ In;
+	short int *__restrict__ Out;
+	unsigned short int W;
+	unsigned short int H;
+    signed char *__restrict__ Infos;
+} CNN_Float32Fp_T;
+
+// float32 -> unsigned short
+typedef struct {
+	float *__restrict__ In;
+	unsigned short int *__restrict__ Out;
+	unsigned short int W;
+	unsigned short int H;
+    signed char *__restrict__ Infos;
+} CNN_Float32UFp_T;
+
+// float32 -> signed char
+typedef struct {
+	float *__restrict__ In;
+	signed char *__restrict__ Out;
+	unsigned short int W;
+	unsigned short int H;
+    signed char *__restrict__ Infos;
+} CNN_Float32Fps_T;
+
+// float32 -> unsigned char
+typedef struct {
+	float *__restrict__ In;
+	unsigned char *__restrict__ Out;
+	unsigned short int W;
+	unsigned short int H;
+    signed char *__restrict__ Infos;
+} CNN_Float32UFps_T;
+
+// signed short -> float32
+typedef struct {
+	short int *__restrict__ In;
+	float *__restrict__ Out;
+	unsigned short int W;
+	unsigned short int H;
+    signed char *__restrict__ Infos;
+} CNN_FpFloat32_T;
+
+// unsigned short -> float32
+typedef struct {
+	unsigned short int *__restrict__ In;
+	float *__restrict__ Out;
+	unsigned short int W;
+	unsigned short int H;
+    signed char *__restrict__ Infos;
+} CNN_UFpFloat32_T;
+
+// signed char -> float32
+typedef struct {
+	signed char *__restrict__ In;
+	float *__restrict__ Out;
+	unsigned short int W;
+	unsigned short int H;
+    signed char *__restrict__ Infos;
+} CNN_FpsFloat32_T;
+
+// unsigned char -> float32
+typedef struct {
+	unsigned char *__restrict__ In;
+	float *__restrict__ Out;
+	unsigned short int W;
+	unsigned short int H;
+    signed char *__restrict__ Infos;
+} CNN_UFpsFloat32_T;
+
 #define AT_INF_QUANT_ZERO_DIFF		0
 #define AT_INF_QUANT_SCALE			4
 #define AT_INF_QUANT_NORM			6
@@ -405,5 +477,15 @@ extern void CNN_UFpFloat16(CNN_UFpFloat16_T * Arg);
 extern void CNN_FpsFloat16(CNN_FpsFloat16_T * Arg);
 extern void CNN_UFpsFloat16(CNN_UFpsFloat16_T * Arg);
 
+extern void CNN_FpFloat32(CNN_FpFloat32_T * Arg);
+extern void CNN_UFpFloat32(CNN_UFpFloat32_T * Arg);
+extern void CNN_FpsFloat32(CNN_FpsFloat32_T * Arg);
+extern void CNN_UFpsFloat32(CNN_UFpsFloat32_T * Arg);
+
+extern void CNN_Float32Fp(CNN_Float32Fp_T * Arg);
+extern void CNN_Float32UFp(CNN_Float32UFp_T * Arg);
+extern void CNN_Float32Fps(CNN_Float32Fps_T * Arg);
+extern void CNN_Float32UFps(CNN_Float32UFps_T * Arg);
+
 #endif
 
diff --git a/tools/autotiler_v3/CNN_Libraries_NE16/CNN_BasicKernels_NE16.c b/tools/autotiler_v3/CNN_Libraries_NE16/CNN_BasicKernels_NE16.c
index e8d39e821..1c302417a 100644
--- a/tools/autotiler_v3/CNN_Libraries_NE16/CNN_BasicKernels_NE16.c
+++ b/tools/autotiler_v3/CNN_Libraries_NE16/CNN_BasicKernels_NE16.c
@@ -244,7 +244,7 @@ void NE16_ComputeBorders(KerConv_NE16_T *Arg, int Wo_F, int Wo_L, int Wo, int Ho
 
 	int PadL = Arg->Pad[0], PadR = Arg->Pad[1], PadT = Arg->Pad[2], PadB = Arg->Pad[3];
 	int Nb_KI	= Tile_InFeat/16 + (Tile_InFeat%16?1:0);
-	int Nb_LoadedKI = Arg->TotalInFeatures/16 + (Arg->TotalInFeatures%16?1:0);
+	int Nb_LoadedKI = Arg->TotalInFeatures/(Mode16?8:16) + (Arg->TotalInFeatures%(Mode16?8:16)?1:0);
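+	// A loaded KI block packs 8 input channels in 16-bit mode, 16 in 8-bit mode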
 	int Rem_KI	= Tile_InFeat%16?Tile_InFeat%16:16;
 	int Nb_KO	= Tile_OutFeat/32 + (Tile_OutFeat%32?1:0);
 	int Rem_KO	= Tile_OutFeat%32?Tile_OutFeat%32:32;
@@ -289,7 +289,7 @@ void NE16_ComputeBorders(KerConv_NE16_T *Arg, int Wo_F, int Wo_L, int Wo, int Ho
 					SetNE16_InPointer     ((void *) Max((unsigned int) In, (unsigned int) InPointer + Tile_InFeat*(subfilter_i*Dx + w*Sx-PadL + Tile_InW*(Ho_F*Sy-PadT + subfilter_j*Dy))));
 					SetNE16_OutPointer    (Out + Tile_OutFeat*(w + Ho_F*Tile_OutW));
 					// TODO - checkme I think here you need the total number of loaded chin
-					SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*subfilter_i + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*Fx*subfilter_j);
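+					// Weight stride per sub-filter scales with the weight bit-width: (Mode16?1:2)*Qw bytes per loaded KI block instead of a hard-coded Qw==8 layout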
+					SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*subfilter_i + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*Fx*subfilter_j);
 					SetNE16_Reminders     (1, Rem_H_Subtiles, Rem_KI, Rem_KO, 1, Rem_H_Subtiles);
 					SetNE16_Dim           (Nb_KI, Nb_KO, 1, H_SubTiles);
 					SetNE16_GenConfig     (Gen_Cfg);
@@ -332,7 +332,7 @@ void NE16_ComputeBorders(KerConv_NE16_T *Arg, int Wo_F, int Wo_L, int Wo, int Ho
 					SetNE16_InPointer     (InPointer + Tile_InFeat*(w*Sx-PadL + subfilter_i*Dx + Tile_InW*(Ho_F*Sy-PadT + subfilter_j*Dy)));
 					SetNE16_OutPointer    (Out + Tile_OutFeat*(w + Ho_F*Tile_OutW));
 					// TODO - checkme I think here you need the total number of loaded chin
-					SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*subfilter_i + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*Fx*subfilter_j);
+					SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*subfilter_i + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*Fx*subfilter_j);
 					SetNE16_Reminders     (1, Rem_H_Subtiles, Rem_KI, Rem_KO, 1, Rem_H_Subtiles);
 					SetNE16_Dim           (Nb_KI, Nb_KO, 1, H_SubTiles);
 					SetNE16_GenConfig     (Gen_Cfg);
@@ -375,7 +375,7 @@ void NE16_ComputeBorders(KerConv_NE16_T *Arg, int Wo_F, int Wo_L, int Wo, int Ho
 					SetNE16_InPointer     ((void *) Max((unsigned int) InPointer, (unsigned int) InPointer + Tile_InFeat*(Wo_F*Sx-PadL + subfilter_i*Dx + (subfilter_j*Dy+h*Sy-PadT)*Tile_InW)));
 					SetNE16_OutPointer    (Out + Tile_OutFeat*(Wo_F + h*Tile_OutW));
 					// TODO - checkme I think here you need the total number of loaded chin
-					SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*subfilter_i + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*Fx*subfilter_j);
+					SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*subfilter_i + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*Fx*subfilter_j);
 					SetNE16_Reminders     (Rem_W_Subtiles, 1, Rem_KI, Rem_KO, Rem_W_Subtiles, 1);
 					SetNE16_Dim           (Nb_KI, Nb_KO, W_SubTiles, 1);
 					SetNE16_GenConfig     (Gen_Cfg);
@@ -418,7 +418,7 @@ void NE16_ComputeBorders(KerConv_NE16_T *Arg, int Wo_F, int Wo_L, int Wo, int Ho
 					SetNE16_InPointer     (InPointer + Tile_InFeat*(Wo_F*Sx-PadL + subfilter_i*Dx + (subfilter_j*Dy+h*Sy-PadT)*Tile_InW));
 					SetNE16_OutPointer    (Out + Tile_OutFeat*(Wo_F + h*Tile_OutW));
 					// TODO - checkme I think here you need the total number of loaded chin
-					SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*subfilter_i + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*Fx*subfilter_j);
+					SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*subfilter_i + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*Fx*subfilter_j);
 					SetNE16_Reminders     (Rem_W_Subtiles, 1, Rem_KI, Rem_KO, Rem_W_Subtiles, 1);
 					SetNE16_Dim           (Nb_KI, Nb_KO, W_SubTiles, 1);
 					SetNE16_GenConfig     (Gen_Cfg);
@@ -463,7 +463,7 @@ void NE16_ComputeBorders(KerConv_NE16_T *Arg, int Wo_F, int Wo_L, int Wo, int Ho
 						SetNE16_InPointer     ((void *) Max((unsigned int) In, (unsigned int) InPointer + Tile_InFeat*(subfilter_i*Dx + w*Sx - PadL + (subfilter_j*Dy+h*Sy-PadT)*Tile_InW)));
 						SetNE16_OutPointer    (Out + Tile_OutFeat*(w + h*Tile_OutW));
 						// TODO - checkme I think here you need the total number of loaded chin
-						SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*subfilter_i + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*Fx*subfilter_j);
+						SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*subfilter_i + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*Fx*subfilter_j);
 						SetNE16_Reminders     (1, 1, Rem_KI, Rem_KO, 1, 1);
 						SetNE16_Dim           (Nb_KI, Nb_KO, 1, 1);
 						SetNE16_GenConfig     (Gen_Cfg);
@@ -509,7 +509,7 @@ void NE16_ComputeBorders(KerConv_NE16_T *Arg, int Wo_F, int Wo_L, int Wo, int Ho
 						SetNE16_InPointer     ((void *) Max((unsigned int) In, (unsigned int) InPointer + Tile_InFeat*(subfilter_i*Dx + (w*Sx-PadL) + (subfilter_j*Dy+h*Sy-PadT)*Tile_InW)));
 						SetNE16_OutPointer    (Out + Tile_OutFeat*(w + h*Tile_OutW));
 						// TODO - check: this likely needs the total number of loaded input channels (chin)
-						SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*subfilter_i + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*Fx*subfilter_j);
+						SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*subfilter_i + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*Fx*subfilter_j);
 						SetNE16_Reminders     (1, 1, Rem_KI, Rem_KO, 1, 1);
 						SetNE16_Dim           (Nb_KI, Nb_KO, 1, 1);
 						SetNE16_GenConfig     (Gen_Cfg);
@@ -555,7 +555,7 @@ void NE16_ComputeBorders(KerConv_NE16_T *Arg, int Wo_F, int Wo_L, int Wo, int Ho
 						SetNE16_InPointer     ((void *) Max((unsigned int) In, (unsigned int) InPointer + Tile_InFeat*(subfilter_i*Dx + w*Sx - PadL + (subfilter_j*Dy+h*Sy-PadT)*Tile_InW)));
 						SetNE16_OutPointer    (Out + Tile_OutFeat*(h*Tile_OutW + w));
 						// TODO - check: this likely needs the total number of loaded input channels (chin)
-						SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*subfilter_i + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*Fx*subfilter_j);
+						SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*subfilter_i + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*Fx*subfilter_j);
 						SetNE16_Reminders     (1, 1, Rem_KI, Rem_KO, 1, 1);
 						SetNE16_Dim           (Nb_KI, Nb_KO, 1, 1);
 						SetNE16_GenConfig     (Gen_Cfg);
@@ -601,7 +601,7 @@ void NE16_ComputeBorders(KerConv_NE16_T *Arg, int Wo_F, int Wo_L, int Wo, int Ho
 						SetNE16_InPointer     (InPointer + Tile_InFeat*(subfilter_i*Dx + w*Sx-PadL + (subfilter_j*Dy+h*Sy-PadT)*Tile_InW));
 						SetNE16_OutPointer    (Out + Tile_OutFeat*(h*Tile_OutW + w));
 						// TODO - check: this likely needs the total number of loaded input channels (chin)
-						SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*subfilter_i + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*Fx*subfilter_j);
+						SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*subfilter_i + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*Fx*subfilter_j);
 						SetNE16_Reminders     (1, 1, Rem_KI, Rem_KO, 1, 1);
 						SetNE16_Dim           (Nb_KI, Nb_KO, 1, 1);
 						SetNE16_GenConfig     (Gen_Cfg);
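
The eight weight-pointer hunks above replace the hard-coded (Mode16?8:16) byte factor, which baked in 8-bit weights, with (Mode16?1:2)*Arg->Qw, so the subfilter stride tracks the configured weight bit-width. A minimal sketch of the stride arithmetic, with a hypothetical helper name; with Qw == 8 it reduces exactly to the old constants:

	/* Byte stride between consecutive subfilter weight slices (sketch).
	   (Mode16 ? 1 : 2) * Qw == (Mode16 ? 8 : 16) exactly when Qw == 8. */
	static inline unsigned int NE16_SubfilterWeightStride(
		unsigned int Tile_OutFeat, int Mode16, unsigned int Qw, unsigned int Nb_LoadedKI)
	{
		return Tile_OutFeat * (Mode16 ? 1 : 2) * Qw * Nb_LoadedKI;
	}

Each SetNE16_WeightsPointer call above then amounts to Filter + Stride*subfilter_i + Stride*Fx*subfilter_j.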
@@ -1034,14 +1034,6 @@ void KerConv3x3Stride2_NE16(KerConv_NE16_T *Arg)
 	SetNE16_WOffset       (Arg->W_Offset);
 	SetNE16_GenConfig     (Gen_Cfg);
 
-	subtile_j_major ++;
-	if(subtile_j_major==W_subtiles) {
-		subtile_j_major = 0;
-		subtile_i_major ++;
-		IsLastSubtileH = subtile_i_major>=(H_subtiles-1);
-	}
-	IsLastSubtileW = subtile_j_major==(W_subtiles-1);
-
 	// commit and trigger the NE16 computation now, while programming the next one
 	NE16_WRITE_CMD(NE16_COMMIT_AND_TRIGGER, NE16_TRIGGER_CMD);
 	if (IsLastSubtileH && IsLastSubtileW) {
@@ -1052,6 +1044,14 @@ void KerConv3x3Stride2_NE16(KerConv_NE16_T *Arg)
 		return;
 	}
 
+	subtile_j_major ++;
+	if(subtile_j_major==W_subtiles) {
+		subtile_j_major = 0;
+		subtile_i_major ++;
+		IsLastSubtileH = subtile_i_major>=(H_subtiles-1);
+	}
+	IsLastSubtileW = subtile_j_major==(W_subtiles-1);
+
 	// acquire second job
 	NE16_BARRIER_ACQUIRE(job_id);
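
The pair of hunks above (and the matching pairs further down) move the subtile-counter advance from before the commit to after the last-subtile early return. The IsLastSubtile flags tested at the commit must describe the job that was just programmed; advancing first made the flags describe the next job, so the early return could fire one subtile too soon (e.g. a 2x1 subtile grid would return after the first job and skip the second). The corrected prologue pattern, condensed:

	NE16_WRITE_CMD(NE16_COMMIT_AND_TRIGGER, NE16_TRIGGER_CMD); /* run this job */
	if (IsLastSubtileH && IsLastSubtileW) {
		/* drain, release the accelerator, return */
	}
	subtile_j_major++;                                         /* then advance */
	if (subtile_j_major == W_subtiles) {
		subtile_j_major = 0;
		subtile_i_major++;
		IsLastSubtileH = subtile_i_major >= (H_subtiles - 1);
	}
	IsLastSubtileW = subtile_j_major == (W_subtiles - 1);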
 
@@ -1104,14 +1104,6 @@ void KerConv3x3Stride2_NE16(KerConv_NE16_T *Arg)
 	SetNE16_WOffset       (Arg->W_Offset);
 	SetNE16_GenConfig     (Gen_Cfg);
 
-	subtile_j_major ++;
-	if(subtile_j_major==W_subtiles) {
-		subtile_j_major = 0;
-		subtile_i_major ++;
-		IsLastSubtileH = subtile_i_major>=(H_subtiles-1);
-	}
-	IsLastSubtileW = subtile_j_major==(W_subtiles-1);
-
 	// commit and trigger the NE16 computation now, while programming the next one
 	NE16_WRITE_CMD(NE16_COMMIT_AND_TRIGGER, NE16_TRIGGER_CMD);
 	if (IsLastSubtileH && IsLastSubtileW) {
@@ -1122,6 +1114,14 @@ void KerConv3x3Stride2_NE16(KerConv_NE16_T *Arg)
 		return;
 	}
 
+	subtile_j_major ++;
+	if(subtile_j_major==W_subtiles) {
+		subtile_j_major = 0;
+		subtile_i_major ++;
+		IsLastSubtileH = subtile_i_major>=(H_subtiles-1);
+	}
+	IsLastSubtileW = subtile_j_major==(W_subtiles-1);
+
 	// main strided-conv iteration loop (no longer needs ne16_c0_config)
 	do {
                 // acquire job
@@ -1283,14 +1283,6 @@ void KerConvDW3x3Stride2_NE16(KerConv_NE16_T *Arg)
 	SetNE16_WOffset       (Arg->W_Offset);
 	SetNE16_GenConfig     (Gen_Cfg);
 
-	subtile_j_major ++;
-	if(subtile_j_major==W_subtiles) {
-		subtile_j_major = 0;
-		subtile_i_major ++;
-		IsLastSubtileH = subtile_i_major==(H_subtiles-1);
-	}
-	IsLastSubtileW = subtile_j_major==(W_subtiles-1);
-
 	// commit and trigger the NE16 computation now, while programming the next one
 	NE16_WRITE_CMD(NE16_COMMIT_AND_TRIGGER, NE16_TRIGGER_CMD);
 	if (IsLastSubtileH && IsLastSubtileW) {
@@ -1301,6 +1293,14 @@ void KerConvDW3x3Stride2_NE16(KerConv_NE16_T *Arg)
 		return;
 	}
 
+	subtile_j_major ++;
+	if(subtile_j_major==W_subtiles) {
+		subtile_j_major = 0;
+		subtile_i_major ++;
+		IsLastSubtileH = subtile_i_major>=(H_subtiles-1);
+	}
+	IsLastSubtileW = subtile_j_major==(W_subtiles-1);
+
 	// acquire second job
 	NE16_BARRIER_ACQUIRE(job_id);
 
@@ -1353,14 +1353,6 @@ void KerConvDW3x3Stride2_NE16(KerConv_NE16_T *Arg)
 	SetNE16_WOffset       (Arg->W_Offset);
 	SetNE16_GenConfig     (Gen_Cfg);
 
-	subtile_j_major ++;
-	if(subtile_j_major==W_subtiles) {
-		subtile_j_major = 0;
-		subtile_i_major ++;
-		IsLastSubtileH = subtile_i_major==(H_subtiles-1);
-	}
-	IsLastSubtileW = subtile_j_major==(W_subtiles-1);
-
 	// commit and trigger the NE16 computation now, while programming the next one
 	NE16_WRITE_CMD(NE16_COMMIT_AND_TRIGGER, NE16_TRIGGER_CMD);
 	if (IsLastSubtileH && IsLastSubtileW) {
@@ -1371,13 +1363,21 @@ void KerConvDW3x3Stride2_NE16(KerConv_NE16_T *Arg)
 		return;
 	}
 
+	subtile_j_major ++;
+	if(subtile_j_major==W_subtiles) {
+		subtile_j_major = 0;
+		subtile_i_major ++;
+		IsLastSubtileH = subtile_i_major>=(H_subtiles-1);
+	}
+	IsLastSubtileW = subtile_j_major==(W_subtiles-1);
+
 	// main strided-conv iteration loop (no longer needs ne16_c0_config)
 	do {
 	        // acquire next job
 	        NE16_BARRIER_ACQUIRE(job_id);
 
-		IsLastSubtileH = subtile_i_major==(H_subtiles-1);
-		IsLastSubtileW = subtile_j_major==(W_subtiles-1);
+		IsLastSubtileH = subtile_i_major>=(H_subtiles-1);
+		IsLastSubtileW = subtile_j_major>=(W_subtiles-1);
 		// update input / output pointers
 		if(IsLastSubtileH && IsLastSubtileW) {
 			Rem_HO = Last_Rem_HO;  Rem_WO = Last_Rem_WO;
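
In the same function the last-subtile tests also switch from == to >= (both in the re-added prologue blocks and inside the main loop). The >= form makes the flags monotone: once a counter reaches or passes the boundary the flag stays set, which is the safe invariant given that the prologue advances the counters twice before the loop ever tests them; == would read false again if a counter overshoots. A small comparison of the two forms:

	/* If subtile_i_major ends up past the boundary, e.g. == H_subtiles after
	   a wrap in a single-row subtile layout: */
	IsLastSubtileH = subtile_i_major >= (H_subtiles - 1);  /* stays true   */
	IsLastSubtileH = subtile_i_major == (H_subtiles - 1);  /* reads false  */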
@@ -1508,7 +1508,7 @@ void KerConv1D_StrideS_NE16(KerConv_NE16_T *Arg)
 
 		unsigned char *pIn   = In + Tile_InFeat*(w*Sx - Arg->Pad[0] + subfilter_i*Dx);
 		unsigned char *pOut  = Out    + OutBytes*Tile_OutFeat*w;
-		unsigned char *pFilt = Filter + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*subfilter_i;
+		unsigned char *pFilt = Filter + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*subfilter_i;
 		if (LastSubtileW) {
 			Rem_HI = LastSubtileHeight;
 			w += LastSubtileSize;
@@ -1628,7 +1628,7 @@ void KerConvNxMDxDy_StrideSxSy_NE16(KerConv_NE16_T *Arg)
 			// update input / output pointers
 			SetNE16_InPointer     (In + Tile_InFeat*(Wo_F*Sx - Arg->Pad[0] + subfilter_i*Dx + Tile_InW*(Ho_F*Sy - Arg->Pad[2] + subfilter_j*Dy)));
 			SetNE16_OutPointer    (Out + OutBytes*Tile_OutFeat*(Wo_F + Tile_OutW*Ho_F));
-			SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?8:16)*Nb_LoadedKI*(subfilter_i + Fx*subfilter_j));
+			SetNE16_WeightsPointer(Filter + Tile_OutFeat*(Mode16?1:2)*Arg->Qw*Nb_LoadedKI*(subfilter_i + Fx*subfilter_j));
 			SetNE16_Reminders     (Rem_WO, Rem_HO, Rem_KI, Rem_KO, Rem_WO, Rem_HO);
 			SetNE16_Dim           (Nb_KI, Nb_KO, Nb_WO, Nb_HO);
 			SetNE16_GenConfig     (Gen_Cfg);
@@ -2176,7 +2176,7 @@ void KerLinear_16a_NE16(KerLinear_NE16_T *Arg)
 		NE16_BARRIER_ACQUIRE(job_id);
 		// load configuration for the layer
 		SetNE16_OutPointer    (Out);
-		SetNE16_WeightsPointer(Filter+subtile_ki*256*Tile_OutFeat);
+		SetNE16_WeightsPointer(Filter+subtile_ki*256*Tile_OutFeat*Arg->Qw/8);
 		SetNE16_BiasPointer   (Bias);
 		SetNE16_ScalePointer  (Scale);
 		SetNE16_ScaleNPointer (ScaleN);
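
In KerLinear_16a_NE16 each KI subtile advances the weight pointer by 256*Tile_OutFeat weights; at Qw bits per weight that is 256*Tile_OutFeat*Qw/8 bytes, and the old expression was the Qw == 8 special case. Restated for clarity:

	/* Bytes of packed weights per KI subtile: 256*Tile_OutFeat weights at
	   Qw bits each. Qw == 8 recovers the old 256*Tile_OutFeat offset. */
	unsigned int SubtileBytes = 256u * Tile_OutFeat * Arg->Qw / 8;
	SetNE16_WeightsPointer(Filter + subtile_ki * SubtileBytes);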
@@ -2433,7 +2433,7 @@ void KerMatMul_8a_NE16(KerMatMul_NE16_T *Arg)
 }
 
 void Ker_MM_Conv2D_NE16(
-	Ker_MM_Conv_NE16_T *Arg
+	KerConv_MM_NE16_T *Arg
 	)
 
 {
@@ -2443,14 +2443,13 @@ void Ker_MM_Conv2D_NE16(
 	int Fx = Arg->Fx, Sx = Arg->Sx;
 	int Fy = Arg->Fy, Sy = Arg->Sy;
 	int FS = Fx*Fy;
-	int PadL = Arg->Pad[0], PadT = Arg->Pad[2];
+	int PadL = Arg->Pad[0], PadT = Arg->Pad[2], Pad_Val = Arg->Pad_Val;
 	int InFeat = Arg->Tile_InFeat, OutFeat = Arg->Tile_OutFeat;
         int * __restrict__ Bias = Arg->Bias;
         signed char * __restrict__ Out = Arg->Out;
         unsigned char * __restrict__ Scale = Arg->Scale;
         unsigned char * __restrict__ ScaleN = Arg->ScaleN;
         unsigned char * __restrict__ ColBuff1 = Arg->ColBuff;
-        unsigned char * __restrict__ ColBuff2 = Arg->ColBuff + InFeat*FS;
 	int Wo = Arg->Tile_OutW, Ho = Arg->Tile_OutH;
 	unsigned int * Semaphores = Arg->Semaphores;
 
@@ -2469,6 +2468,7 @@ void Ker_MM_Conv2D_NE16(
 
 	int ColBuffSize = ((W_In1+15)/16)*16;
 	int Tail = ColBuffSize / 4;
+        unsigned char * __restrict__ ColBuff2 = Arg->ColBuff + ColBuffSize;
 	((int *)ColBuff1)[Tail-1] = 0; ((int *)ColBuff1)[Tail-2] = 0; ((int *)ColBuff1)[Tail-3] = 0; ((int *)ColBuff1)[Tail-4] = 0;
 	((int *)ColBuff2)[Tail-1] = 0; ((int *)ColBuff2)[Tail-2] = 0; ((int *)ColBuff2)[Tail-3] = 0; ((int *)ColBuff2)[Tail-4] = 0;
 	int PosL = Arg->FirstTile?(-PadT):0;
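
Ker_MM_Conv2D_NE16 double-buffers its im2col columns. Each column is rounded up to a 16-byte multiple and its tail words are cleared through indices Tail-1..Tail-4, so the second buffer must start one rounded column past the first, not InFeat*Fx*Fy bytes in as before; otherwise ColBuff1's zeroed tail overlaps the start of ColBuff2. With W_In1 the raw im2col width (InFeat*Fx*Fy in this kernel):

	int ColBuffSize = ((W_In1 + 15) / 16) * 16;            /* 16-byte rounded */
	unsigned char *ColBuff2 = Arg->ColBuff + ColBuffSize;  /* disjoint from   */
	                                                       /* ColBuff1's tail */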
@@ -2518,9 +2518,9 @@ void Ker_MM_Conv2D_NE16(
 				gap_waitbarrier(0);
 
 				/* Im2Col */
-				for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*FS))[i]=0;
-				if (Iter1&0x2) ((short int *)(ColBuff+F*FS))[Iter1/2-1]=0;
-				if (Iter1&0x1) ((signed char *)(ColBuff+F*FS))[Iter1-1]=0;
+				for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*FS))[i]=Pad_Val;
+				if (Iter1&0x2) ((short int *)(ColBuff+F*FS))[Iter1/2-1]=Pad_Val;
+				if (Iter1&0x1) ((signed char *)(ColBuff+F*FS))[Iter1-1]=Pad_Val;
 				int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W);
 				int OffC = -Lb - Min(PosC, 0);
 				int Size = Rb-Lb;
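
The im2col fill just above now writes Arg->Pad_Val instead of 0, letting padded positions carry the zero-point of an asymmetrically quantized input. Note the word and halfword stores only produce a uniform fill if Pad_Val already holds the pad byte replicated into every lane, which this sketch assumes the caller prepares (the 0x80 value is illustrative):

	unsigned char PadByte = 0x80;                   /* e.g. the input zero-point */
	unsigned int  Pad_Val = PadByte * 0x01010101u;  /* replicate to all 4 lanes  */
	/* 32-bit, 16-bit and 8-bit stores of Pad_Val now all write PadByte into
	   every byte position they touch. */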
diff --git a/tools/autotiler_v3/CNN_Libraries_NE16/CNN_BasicKernels_NE16.h b/tools/autotiler_v3/CNN_Libraries_NE16/CNN_BasicKernels_NE16.h
index 38311e356..17d64ec8f 100644
--- a/tools/autotiler_v3/CNN_Libraries_NE16/CNN_BasicKernels_NE16.h
+++ b/tools/autotiler_v3/CNN_Libraries_NE16/CNN_BasicKernels_NE16.h
@@ -83,7 +83,7 @@ typedef struct {
 	unsigned char	   Dx;
 	unsigned char	   Dy;
 	unsigned int       Semaphores[2];
-} Ker_MM_Conv_NE16_T;
+} KerConv_MM_NE16_T;
 
 typedef struct {
 	void * __restrict__  In;
@@ -243,7 +243,7 @@ void KerConvNxMDxDy_StrideSxSy_NE16(KerConv_NE16_T *Arg);
 void KerConv3x3Stride1_DxDy_NE16(KerConv_NE16_T *Arg);
 void KerLinear_8a_NE16(KerLinear_NE16_T *Arg);
 void KerLinear_16a_NE16(KerLinear_NE16_T *Arg);
-void Ker_MM_Conv2D_NE16(Ker_MM_Conv_NE16_T *Arg);
+void Ker_MM_Conv2D_NE16(KerConv_MM_NE16_T *Arg);
 void KerMatMul_8a_NE16(KerMatMul_NE16_T *Arg);
 void KerMatMul_8aFast_NE16(KerMatMul_NE16_T *Arg);
 void KerMatMul_16a_NE16(KerMatMul_NE16_T *Arg);
diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_BasicKernels_SQ8.h b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_BasicKernels_SQ8.h
index eb5fdb638..0475c801a 100644
--- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_BasicKernels_SQ8.h
+++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_BasicKernels_SQ8.h
@@ -1206,7 +1206,10 @@ extern void KerParMatMulB32_ReLUN_SF_SQ8(KerMatMul_SQ8_T *Arg);
 
 extern void KerPar_MM_Conv1x1_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
 extern void KerPar_MM_Conv1x1_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+
+extern void Ker_MM_Conv1x1_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
 extern void Ker_MM_Conv1x1_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+
 extern void KerPar_MM_Conv1D_SQ8(Ker_MM_Conv_SQ8_T *Arg);
 extern void KerPar_MM_Conv1D_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
 extern void KerPar_MM_Conv1D_ReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg);
diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatAlgebra_SQ8.c b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatAlgebra_SQ8.c
index 6f713b20b..1d9b4c173 100644
--- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatAlgebra_SQ8.c
+++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatAlgebra_SQ8.c
@@ -3205,11 +3205,39 @@ void KerParMatMulB32_SF_SQ8(KerMatMul_SQ8_T *Arg)
 	for (int i=0; i<Iter/4; i++) {
 		int l2 = 4*i+First;
 		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2), *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2), *pIn2_2 = (v4s *) (In2 + (l2+2)*W_In2), *pIn2_3 = (v4s *) (In2 + (l2+3)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
+		for (int j=0; j<H_In1/2; j++) {
+			int l1 = 2*j;
+			v4s *pIn1_0 = (v4s *) (In1 + l1*W_In1);
+			v4s *pIn1_1 = (v4s *) (In1 + (l1+1)*W_In1);
+			int S0 = Bias[l1]  <<NormBias, S1=S0, S2=S0, S3=S0;
+			int S4 = Bias[l1+1]<<NormBias, S5=S4, S6=S4, S7=S4;
+			for (int c=0; c<W_In1/4; c++) {
+				v4s C0 = pIn1_0[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
+				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
+				v4s C1 = pIn1_1[c];
+				S4 = gap_sumdotp4(C1, V0, S4); S5 = gap_sumdotp4(C1, V1, S5); S6 = gap_sumdotp4(C1, V2, S6); S7 = gap_sumdotp4(C1, V3, S7);
+			}
+			for (int c=(W_In1/4)*4; c<W_In1; c++) {
+				int C0 = In1[l1*W_In1+c];
+				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
+				int C1 = In1[(l1+1)*W_In1+c];
+				S4 += C1 * In2[(l2+0)*W_In2+c]; S5 += C1 * In2[(l2+1)*W_In2+c]; S6 += C1 * In2[(l2+2)*W_In2+c]; S7 += C1 * In2[(l2+3)*W_In2+c];
+			}
+			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
+			v4s R = gap_pack4(gap_clip(AT_SCALE(S0, Sc, ScN), 7), gap_clip(AT_SCALE(S1, Sc, ScN), 7),
+					  gap_clip(AT_SCALE(S2, Sc, ScN), 7), gap_clip(AT_SCALE(S3, Sc, ScN), 7));
+			*((v4s *) (Out+l1*H_In2 + l2)) = R;
+			unsigned int Sc1 = Scale[l1+1], ScN1 = ScaleN[l1+1];
+			v4s R1 = gap_pack4(gap_clip(AT_SCALE(S4, Sc1, ScN1), 7), gap_clip(AT_SCALE(S5, Sc1, ScN1), 7),
+					   gap_clip(AT_SCALE(S6, Sc1, ScN1), 7), gap_clip(AT_SCALE(S7, Sc1, ScN1), 7));
+			*((v4s *) (Out+(l1+1)*H_In2 + l2)) = R1;
+		}
+		if (H_In1&0x1) {
+			int l1 = H_In1 - 1;
+			v4s *pIn1_0 = (v4s *) (In1 + l1*W_In1);
 			int S0 = Bias[l1]<<NormBias, S1=S0, S2=S0, S3=S0;
 			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
+				v4s C0 = pIn1_0[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
 				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
 			}
 			for (int c=(W_In1/4)*4; c<W_In1; c++) {
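
The KerParMatMulB32_SF_SQ8 hunk unrolls the In1 loop by two rows: the four In2 vectors are loaded once per step and dot-producted against both rows, filling a 2x4 tile of accumulators S0..S7 and roughly halving In2 load traffic; the if (H_In1&0x1) branch keeps the original single-row body for an odd trailing row. The blocking pattern, condensed from the hunk:

	for (int c = 0; c < W_In1/4; c++) {
		v4s C0 = pIn1_0[c], C1 = pIn1_1[c];              /* 2 In1 rows    */
		v4s V0 = pIn2_0[c], V1 = pIn2_1[c],
		    V2 = pIn2_2[c], V3 = pIn2_3[c];              /* 4 In2 columns */
		S0 = gap_sumdotp4(C0, V0, S0); S4 = gap_sumdotp4(C1, V0, S4);
		S1 = gap_sumdotp4(C0, V1, S1); S5 = gap_sumdotp4(C1, V1, S5);
		S2 = gap_sumdotp4(C0, V2, S2); S6 = gap_sumdotp4(C1, V2, S6);
		S3 = gap_sumdotp4(C0, V3, S3); S7 = gap_sumdotp4(C1, V3, S7);
	}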
diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatMul_Conv_SQ8.c b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatMul_Conv_SQ8.c
index 123186654..81e4e8db1 100644
--- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatMul_Conv_SQ8.c
+++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatMul_Conv_SQ8.c
@@ -84,6 +84,7 @@ void KerPar_MM_Conv1D_SQ8(
 	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
 	
 	int Iter = (L-F)*Fx;
+	int IterOut = Last-First;
 	int PosL = 0;
 	for (int l=0; l<Ho; l++) {
 		int PosC = -PadL;
@@ -156,154 +157,6 @@ void KerPar_MM_Conv1D_SQ8(
 	}
 }
 
-void _KerPar_MM_Conv1x1_HWC_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
-	)
-
-{
-	/*
-		For HWC weights (4D Tensor) are expected to be organized as [OutFeat x Fy x Fx x InFeat]
-		ColBuff must be 4 byte aligned large enough to accomodate 2*Align(Feat, 4)
-	*/
-	signed char *__restrict__ In = Arg->In;
-	int W = Arg->W, H = Arg->H;
-	signed char *__restrict__ Filter = Arg->Filter;
-	int Sx = Arg->Sx, Sy = Arg->Sy;
-	int PadL = Arg->Pad[0];
-	int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
-
-        int * __restrict__ Bias = Arg->Bias;
-	int NormBias = Arg->Infos[AT_INF_BIASN];
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned char * __restrict__ Scale = Arg->Scale;
-        unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-        signed char * __restrict__ ColBuff0 = Arg->ColBuff;
-        signed char * __restrict__ ColBuff1 = Arg->ColBuff+ALIGN(InFeat, 2);
-
-	int Wo = Arg->Wo, Ho = Arg->Ho;
-
-	/* ColBuff must be large enough to accomodate Align(InFeat, 8) elements */
-	v4s * __restrict__ VBuff0 = (v4s *) ColBuff0;
-	v4s * __restrict__ VBuff1 = (v4s *) ColBuff1;
-	unsigned int W_In1 = InFeat;
-	unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C);
-	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
-
-	int Tail = ((W_In1+3)/4);
-	((int *)ColBuff0)[Tail-1] = 0;
-	((int *)ColBuff1)[Tail-1] = 0;
-	
-	int PosL = 0;
-	int Iter = L-F;
-	for (int l=0; l<Ho; l++) {
-		int PosC = 0;
-		for (int c=0; c<Wo/2; c++) {
-			if (Iter>=4) {
-				for (int f=0; f<(Iter/4); f++) {
-					int X0 = ((int *)(In+PosL*W*InFeat + (PosC+0)*InFeat+F))[f];
-					int X1 = ((int *)(In+PosL*W*InFeat + (PosC+1)*InFeat+F))[f];
-					((int *)(ColBuff0+F))[f] = X0;
-					((int *)(ColBuff1+F))[f] = X1;
-				}
-				if (Iter&0x2) {
-					int X0 = ((short int *)(In+PosL*W*InFeat + (PosC+0)*InFeat+F))[Iter/2-1];
-					int X1 = ((short int *)(In+PosL*W*InFeat + (PosC+1)*InFeat+F))[Iter/2-1];
-					((short int *)(ColBuff0+F))[Iter/2-1] = X0;
-					((short int *)(ColBuff1+F))[Iter/2-1] = X1;
-				}
-				if (Iter&0x1) {
-					int X0 = ((signed char *)(In+PosL*W*InFeat + (PosC+0)*InFeat+F))[Iter-1];
-					int X1 = ((signed char *)(In+PosL*W*InFeat + (PosC+1)*InFeat+F))[Iter-1];
-					((signed char *)(ColBuff0+F))[Iter-1] = X0;
-					((signed char *)(ColBuff1+F))[Iter-1] = X1;
-				}
-			} else if (Iter>=2) {
-				if (Iter&0x2) {
-					int X0 = ((short int *)(In+PosL*W*InFeat + (PosC+0)*InFeat+F))[Iter/2-1];
-					int X1 = ((short int *)(In+PosL*W*InFeat + (PosC+1)*InFeat+F))[Iter/2-1];
-					((short int *)(ColBuff0+F))[Iter/2-1] = X0;
-					((short int *)(ColBuff1+F))[Iter/2-1] = X1;
-				}
-				if (Iter&0x1) {
-					int X0 = ((signed char *)(In+PosL*W*InFeat + (PosC+0)*InFeat+F))[Iter-1];
-					int X1 = ((signed char *)(In+PosL*W*InFeat + (PosC+1)*InFeat+F))[Iter-1];
-					((signed char *)(ColBuff0+F))[Iter-1] = X0;
-					((signed char *)(ColBuff1+F))[Iter-1] = X1;
-				}
-			} else {
-				int X0 = In[PosL*W*InFeat + (PosC+0)*InFeat + F];
-				int X1 = In[PosL*W*InFeat + (PosC+1)*InFeat + F];
-				ColBuff0[F] = X0;
-				ColBuff1[F] = X1;
-			}
-			PosC += 2*Sx;
-			gap_waitbarrier(0);
-	                for (int Line=First; Line<Last; Line++) {
-	                        v4s *VIn1 = (v4s *) (&Filter[Line*W_In1 + 0]);
-	                        int S0 = (Bias[Line]<<NormBias);
-				int S1 = S0;
-	                        for (int i=0; i<((W_In1+3)/4); i++) {
-	                                v4s C0 = VIn1[i];
-					v4s V0 = VBuff0[i], V1 = VBuff1[i];
-	                                S0 = gap_sumdotp4(V0, C0, S0);
-	                                S1 = gap_sumdotp4(V1, C0, S1);
-	                        }
-	                        unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-	                        Out[l*Wo*OutFeat + (2*c+0)*OutFeat + Line] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-	                        Out[l*Wo*OutFeat + (2*c+1)*OutFeat + Line] = gap_clip(AT_SCALE(S1, Sc, ScN), 7);
-	                }
-			gap_waitbarrier(0);
-		}
-		if (Wo&0X1) {
-			int c = Wo-1;
-			((int *)ColBuff0)[Tail] = 0;
-			if (Iter>=4) {
-				for (int f=0; f<(Iter/4); f++) {
-					int X0 = ((int *)(In+PosL*W*InFeat + (PosC+0)*InFeat+F))[f];
-					((int *)(ColBuff0+F))[f] = X0;
-				}
-				if (Iter&0x2) {
-					int X0 = ((short int *)(In+PosL*W*InFeat + (PosC+0)*InFeat+F))[Iter/2-1];
-					((short int *)(ColBuff0+F))[Iter/2-1] = X0;
-				}
-				if (Iter&0x1) {
-					int X0 = ((signed char *)(In+PosL*W*InFeat + (PosC+0)*InFeat+F))[Iter-1];
-					((signed char *)(ColBuff0+F))[Iter-1] = X0;
-				}
-			} else if (Iter>=2) {
-				if (Iter&0x2) {
-					int X0 = ((short int *)(In+PosL*W*InFeat + (PosC+0)*InFeat+F))[Iter/2-1];
-					((short int *)(ColBuff0+F))[Iter/2-1] = X0;
-				}
-				if (Iter&0x1) {
-					int X0 = ((signed char *)(In+PosL*W*InFeat + (PosC+0)*InFeat+F))[Iter-1];
-					((signed char *)(ColBuff0+F))[Iter-1] = X0;
-				}
-			} else {
-				int X0 = In[PosL*W*InFeat + (PosC+0)*InFeat + F];
-				ColBuff0[F] = X0;
-			}
-			PosC += Sx;
-			gap_waitbarrier(0);
-	                for (int Line=First; Line<Last; Line++) {
-	                        v4s *VIn1 = (v4s *) (&Filter[Line*W_In1 + 0]);
-	                        int S0 = (Bias[Line]<<NormBias);
-				int S1 = S0;
-	                        for (int i=0; i<((W_In1+7)/8); i++) {
-	                                v4s V0 = VIn1[2*i], V1 = VIn1[2*i+1];
-					v4s C0 = VBuff0[2*i], C1 = VBuff0[2*i+1];
-	                                S0 = gap_sumdotp4(V0, C0, S0);
-	                                S0 = gap_sumdotp4(V1, C1, S0);
-	                        }
-	                        unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-	                        Out[l*Wo*OutFeat + (c)*OutFeat + Line] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-	                }
-			gap_waitbarrier(0);
-		}
-		PosL += Sy;
-	}
-}
-
 static void __attribute__ ((noinline)) MatMul2Out(
 		signed char *__restrict__ pI,
 		signed char *__restrict__ pC,
@@ -702,6 +555,166 @@ void KerPar_MM_Conv1x1_ReLU_HWC_SQ8(
 	gap_waitbarrier(0);
 }
 
+void Ker_MM_Conv1x1_HWC_SQ8(
+	Ker_MM_Conv_SQ8_T *Arg
+	)
+
+{
+	/*
+		For HWC, weights (a 4D tensor) are expected to be organized as [OutFeat x Fy x Fx x InFeat]
+	*/
+	signed char *__restrict__ In = Arg->In;
+	int W = Arg->W, H = Arg->H;
+	signed char *__restrict__ Filter = Arg->Filter;
+	int Sx = Arg->Sx, Sy = Arg->Sy;
+	unsigned int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
+
+        int * __restrict__ Bias = Arg->Bias;
+	int NormBias = Arg->Infos[AT_INF_BIASN];
+        signed char * __restrict__ Out = Arg->Out;
+        unsigned char * __restrict__ Scale = Arg->Scale;
+        unsigned char * __restrict__ ScaleN = Arg->ScaleN;
+
+	int Wo = Arg->Wo, Ho = Arg->Ho;
+
+	unsigned int CoreId = gap_coreid();
+	unsigned int ChunkCell = ChunkSize(Ho), First = Min(Ho, CoreId*ChunkCell), Last  = Min(Ho, First+ChunkCell);
+	int IterOut = Last - First;
+
+	for (int l=First; l<Last; l++) {
+		int PosC = 0;
+		for (int c=0; c<Wo/2; c++) {
+			int *pBias = Bias;
+			signed char *pOut0 = Out+l*Wo*OutFeat + (2*c+0)*OutFeat;
+			signed char *pOut1 = Out+l*Wo*OutFeat + (2*c+1)*OutFeat;
+			signed char *pC = Filter;
+			signed char *pI = (In+l*Sy*W*InFeat + (PosC+0)*InFeat);
+			unsigned char *pSc = Scale;
+			unsigned char *pScN = ScaleN;
+
+			for (int i=0; i<(OutFeat/4); i++) {
+				signed char *pIn0 = pI, *pIn1 = pIn0 + Sx*InFeat,
+					    *pC0 = pC, *pC1 = pC0+InFeat, *pC2 = pC1+InFeat, *pC3 = pC2+InFeat;
+				pC=pC3+InFeat;
+	                        int S00 = (*pBias)<<NormBias, S01 = S00; pBias++;
+	                        int S10 = (*pBias)<<NormBias, S11 = S10; pBias++;
+	                        int S20 = (*pBias)<<NormBias, S21 = S20; pBias++;
+	                        int S30 = (*pBias)<<NormBias, S31 = S30; pBias++;
+				for (int f=0; f<(InFeat/4); f++) {
+					v4s V0 = *((v4s *)pIn0), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
+	                               	S00 = gap_sumdotp4(V0, C0, S00); S01 = gap_sumdotp4(V1, C0, S01);
+	                               	S10 = gap_sumdotp4(V0, C1, S10); S11 = gap_sumdotp4(V1, C1, S11);
+	                               	S20 = gap_sumdotp4(V0, C2, S20); S21 = gap_sumdotp4(V1, C2, S21);
+	                               	S30 = gap_sumdotp4(V0, C3, S30); S31 = gap_sumdotp4(V1, C3, S31);
+					pIn0+=4; pIn1+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
+				}
+				for (int f=4*(InFeat/4); f<InFeat; f++) {
+					int V0 = *pIn0, V1 = *pIn1, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
+					S00 += V0*C0; S01 += V1*C0;
+					S10 += V0*C1; S11 += V1*C1;
+					S20 += V0*C2; S21 += V1*C2;
+					S30 += V0*C3; S31 += V1*C3;
+					pIn0++; pIn1++; pC0++; pC1++; pC2++; pC3++;
+				}
+	                        unsigned int Sc, ScN;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
+				*pOut1 = gap_clip(AT_SCALE(S01, Sc, ScN), 7); pOut1++;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				*pOut0 = gap_clip(AT_SCALE(S10, Sc, ScN), 7); pOut0++;
+				*pOut1 = gap_clip(AT_SCALE(S11, Sc, ScN), 7); pOut1++;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				*pOut0 = gap_clip(AT_SCALE(S20, Sc, ScN), 7); pOut0++;
+				*pOut1 = gap_clip(AT_SCALE(S21, Sc, ScN), 7); pOut1++;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				*pOut0 = gap_clip(AT_SCALE(S30, Sc, ScN), 7); pOut0++;
+				*pOut1 = gap_clip(AT_SCALE(S31, Sc, ScN), 7); pOut1++;
+	                }
+			for (int i=4*(OutFeat/4); i<OutFeat; i++) {
+				signed char *pIn0 = pI, *pIn1 = pIn0 + Sx*InFeat;
+	                        int S00 = (*pBias)<<NormBias, S01 = S00; pBias++;
+				for (int f=0; f<(InFeat/4); f++) {
+					v4s V0 = *((v4s *)pIn0), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC);
+	                               	S00 = gap_sumdotp4(V0, C0, S00); S01 = gap_sumdotp4(V1, C0, S01);
+					pIn0+=4; pIn1+=4; pC+=4;
+				}
+				for (int f=4*(InFeat/4); f<InFeat; f++) {
+					int V0 = *pIn0, V1 = *pIn1, C0 = *pC;
+					S00 += V0*C0; S01 += V1*C0;
+					pIn0++; pIn1++; pC++;
+				}
+	                        unsigned int Sc, ScN;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
+				*pOut1 = gap_clip(AT_SCALE(S01, Sc, ScN), 7); pOut1++;
+			}
+			PosC += 2*Sx;
+		}
+		if (Wo&0x1) {
+			PosC = (Wo/2)*2*Sx;
+			int *pBias = Bias;
+			signed char *pOut0 = Out+l*Wo*OutFeat + (Wo-1)*OutFeat;
+			signed char *pC = Filter;
+			signed char *pI = (In+Sy*l*W*InFeat + (PosC+0)*InFeat);
+			unsigned char *pSc = Scale;
+			unsigned char *pScN = ScaleN;
+
+			for (int i=0; i<(OutFeat/4); i++) {
+				signed char *pIn0 = pI,
+					    *pC0 = pC, *pC1 = pC0+InFeat, *pC2 = pC1+InFeat, *pC3 = pC2+InFeat;
+				pC=pC3+InFeat;
+	                        int S00 = (*pBias)<<NormBias; pBias++;
+	                        int S10 = (*pBias)<<NormBias; pBias++;
+	                        int S20 = (*pBias)<<NormBias; pBias++;
+	                        int S30 = (*pBias)<<NormBias; pBias++;
+				for (int f=0; f<(InFeat/4); f++) {
+					v4s V0 = *((v4s *)pIn0), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
+	                               	S00 = gap_sumdotp4(V0, C0, S00);
+	                               	S10 = gap_sumdotp4(V0, C1, S10);
+	                               	S20 = gap_sumdotp4(V0, C2, S20);
+	                               	S30 = gap_sumdotp4(V0, C3, S30);
+					pIn0+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
+				}
+				for (int f=4*(InFeat/4); f<InFeat; f++) {
+					int V0 = *pIn0, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
+					S00 += V0*C0;
+					S10 += V0*C1;
+					S20 += V0*C2;
+					S30 += V0*C3;
+					pIn0++; pC0++; pC1++; pC2++; pC3++;
+				}
+	                        unsigned int Sc, ScN;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				*pOut0 = gap_clip(AT_SCALE(S10, Sc, ScN), 7); pOut0++;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				*pOut0 = gap_clip(AT_SCALE(S20, Sc, ScN), 7); pOut0++;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				*pOut0 = gap_clip(AT_SCALE(S30, Sc, ScN), 7); pOut0++;
+			}
+			for (int i=4*(OutFeat/4); i<OutFeat; i++) {
+				signed char *pIn0 = pI;
+	                        int S00 = (*pBias)<<NormBias; pBias++;
+				for (int f=0; f<(InFeat/4); f++) {
+					v4s V0 = *((v4s *)pIn0), C0 = *((v4s *)pC);
+	                               	S00 = gap_sumdotp4(V0, C0, S00);
+					pIn0+=4; pC+=4;
+				}
+				for (int f=4*(InFeat/4); f<InFeat; f++) {
+					int V0 = *pIn0, C0 = *pC;
+					S00 += V0*C0;
+					pIn0++; pC++;
+				}
+	                        unsigned int Sc, ScN;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
+			}
+		}
+	}
+	gap_waitbarrier(0);
+}
+
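
Where KerPar_MM_Conv1x1_HWC_SQ8 splits output features across cores and stages pixels through ColBuff, the new Ker_MM_Conv1x1_HWC_SQ8 above splits output rows instead: for a 1x1 filter every HWC pixel is already a contiguous InFeat vector, so no im2col staging and no per-column barrier is needed, only the single gap_waitbarrier on exit. The work split, as used above:

	/* Row-parallel split: each core takes a contiguous band of output rows. */
	unsigned int CoreId    = gap_coreid();
	unsigned int ChunkCell = ChunkSize(Ho);
	unsigned int First     = Min(Ho, CoreId * ChunkCell);
	unsigned int Last      = Min(Ho, First + ChunkCell);
	for (int l = First; l < Last; l++) {
		/* one full output row: all Wo columns, all OutFeat channels */
	}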
 void Ker_MM_Conv1x1_ReLU_HWC_SQ8(
 	Ker_MM_Conv_SQ8_T *Arg
 	)
@@ -893,14 +906,120 @@ void KerPar_MM_Conv1D_HWC_SQ8(
 	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
 
 	int Tail = 2*((W_In1+7)/8);
+	signed char * __restrict__ ColBuff1 = ColBuff + 4*Tail;
 	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
-	
+	((int *)ColBuff1)[Tail-1] = 0; ((int *)ColBuff1)[Tail-2] = 0;
 	int PosL = 0;
 	int Iter = L-F;
 	int Iter1 = Iter*Fx;
-	for (int l=0; l<Ho; l++) {
+	int IterOut = Max(0, Last - First);
+	// for (int l=0; l<Ho; l++) {
 		int PosC = -PadL;
-		for (int c=0; c<Wo; c++) {
+		for (int c=0; c<(Wo/2); c++) {
+			for (int i=0; i<(Iter1/4); i++) {
+				((int *)(ColBuff+F*Fx))[i]=0;
+				((int *)(ColBuff1+F*Fx))[i]=0;
+			}
+			if (Iter1&0x2) {
+				((short int *)(ColBuff+F*Fx))[Iter1/2-1]=0;
+				((short int *)(ColBuff1+F*Fx))[Iter1/2-1]=0;
+			}
+			if (Iter1&0x1) {
+				((signed char *)(ColBuff+F*Fx))[Iter1-1]=0;
+				((signed char *)(ColBuff1+F*Fx))[Iter1-1]=0;
+			}
+			int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W), Off = -Lb - Min(PosC, 0);
+			int Lb1 = Max(PosC+Sx, 0), Rb1 = Min(PosC+Fx+Sx, W), Off1 = -Lb1 - Min(PosC+Sx, 0);
+			if (Iter>0) {
+				if (Iter>=4) {
+					for (int f=0; f<(Iter/4); f++) {
+						for (int i=Lb; i<Rb; i++) ((int *)(ColBuff+(i+Off)*InFeat+F))[f] = ((int *)(In+ i*InFeat+F))[f];
+						for (int i=Lb1; i<Rb1; i++) ((int *)(ColBuff1+(i+Off1)*InFeat+F))[f] = ((int *)(In+ i*InFeat+F))[f];
+					}
+					if (Iter&0x2) {
+						for (int i=Lb; i<Rb; i++) ((short int *)(ColBuff+(i+Off)*InFeat+F))[Iter/2-1] = ((short int *)(In+ i*InFeat+F))[Iter/2-1];
+						for (int i=Lb1; i<Rb1; i++) ((short int *)(ColBuff1+(i+Off1)*InFeat+F))[Iter/2-1] = ((short int *)(In+ i*InFeat+F))[Iter/2-1];
+					}
+					if (Iter&0x1) {
+						for (int i=Lb; i<Rb; i++) ((signed char *)(ColBuff+(i+Off)*InFeat+F))[Iter-1] = ((signed char *)(In+ i*InFeat+F))[Iter-1];
+						for (int i=Lb1; i<Rb1; i++) ((signed char *)(ColBuff1+(i+Off1)*InFeat+F))[Iter-1] = ((signed char *)(In+ i*InFeat+F))[Iter-1];
+					}
+				} else if (Iter>=2) {
+					if (Iter&0x2) {
+						for (int i=Lb; i<Rb; i++) ((short int *)(ColBuff+(i+Off)*InFeat+F))[Iter/2-1] = ((short int *)(In+ i*InFeat+F))[Iter/2-1];
+						for (int i=Lb1; i<Rb1; i++) ((short int *)(ColBuff1+(i+Off1)*InFeat+F))[Iter/2-1] = ((short int *)(In+ i*InFeat+F))[Iter/2-1];
+					}
+					if (Iter&0x1) {
+						for (int i=Lb; i<Rb; i++) ((signed char *)(ColBuff+(i+Off)*InFeat+F))[Iter-1] = ((signed char *)(In+ i*InFeat+F))[Iter-1];
+						for (int i=Lb1; i<Rb1; i++) ((signed char *)(ColBuff1+(i+Off1)*InFeat+F))[Iter-1] = ((signed char *)(In+ i*InFeat+F))[Iter-1];
+					}
+				} else {
+					for (int i=Lb; i<Rb; i++) ColBuff[(i+Off)*InFeat + F] = In[ i*InFeat + F];
+					for (int i=Lb1; i<Rb1; i++) ColBuff1[(i+Off1)*InFeat + F] = In[ i*InFeat + F];
+				}
+			}
+			PosC += 2*Sx;
+			gap_waitbarrier(0);
+
+			int *pBias = Bias + First;
+			signed char *pC = Filter + W_In1*First;
+			signed char *pOut0 = Out + (2*c+0)*OutFeat+First;
+			signed char *pOut1 = Out + (2*c+1)*OutFeat+First;
+			unsigned char *pSc = Scale + First;
+			unsigned char *pScN = ScaleN + First;
+	                for (int Line=0; Line<IterOut/4; Line++) {
+				signed char *pC0 = pC, *pC1 = pC0+W_In1, *pC2 = pC1+W_In1, *pC3 = pC2+W_In1;
+				pC=pC3+W_In1;
+	                        int S0 = (*pBias)<<NormBias, S4=S0; pBias++;
+	                        int S1 = (*pBias)<<NormBias, S5=S1; pBias++;
+	                        int S2 = (*pBias)<<NormBias, S6=S2; pBias++;
+	                        int S3 = (*pBias)<<NormBias, S7=S3; pBias++;
+				signed char *pIn = ColBuff;
+				signed char *pIn1 = ColBuff1;
+	                        for (int i=0; i<(W_In1/4); i++) {
+					v4s V0 = *((v4s *)pIn), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
+	                                S0 = gap_sumdotp4(V0, C0, S0); S4 = gap_sumdotp4(V1, C0, S4); 
+	                                S1 = gap_sumdotp4(V0, C1, S1); S5 = gap_sumdotp4(V1, C1, S5); 
+	                                S2 = gap_sumdotp4(V0, C2, S2); S6 = gap_sumdotp4(V1, C2, S6); 
+	                                S3 = gap_sumdotp4(V0, C3, S3); S7 = gap_sumdotp4(V1, C3, S7); 
+					pIn+=4; pIn1+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
+	                        }
+				for (int f=4*(W_In1/4); f<W_In1; f++) {
+					int V0 = *pIn, V1 = *pIn1, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
+					S0 += V0*C0; S4 += V1*C0;
+					S1 += V0*C1; S5 += V1*C1;
+					S2 += V0*C2; S6 += V1*C2;
+					S3 += V0*C3; S7 += V1*C3;
+					pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++;
+				}
+				v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
+						   gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
+				v4s R2 = gap_pack4(gap_clip(AT_SCALE(S4, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]), 7),
+						   gap_clip(AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]), 7));
+				*((v4s *) (pOut0+4*Line)) = R1;
+				*((v4s *) (pOut1+4*Line)) = R2;
+	                }
+			for (int i=4*(IterOut/4); i<IterOut; i++) {
+				signed char *pIn = ColBuff;
+				signed char *pIn1 = ColBuff1;
+	                        int S0 = (*pBias)<<NormBias, S4=S0; pBias++;
+	                        for (int i=0; i<(W_In1/4); i++) {
+					v4s V0 = *((v4s *)pIn), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC);
+	                               	S0 = gap_sumdotp4(V0, C0, S0); S4 = gap_sumdotp4(V1, C0, S4);
+					pIn+=4; pIn1+=4; pC+=4;
+				}
+				for (int f=4*(W_In1/4); f<W_In1; f++) {
+					int V0 = *pIn, V1 = *pIn1, C0 = *pC;
+					S0 += V0*C0; S4 += V1*C0;
+					pIn++; pIn1++; pC++;
+				}
+				*(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7);
+				*(pOut1+i) = gap_clip(AT_SCALE(S4, pSc[i], pScN[i]), 7);
+			}
+			gap_waitbarrier(0);
+		}
+		if (Wo&0x1) {
+			int c = Wo-1;
 			for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*Fx))[i]=0;
 			if (Iter1&0x2) ((short int *)(ColBuff+F*Fx))[Iter1/2-1]=0;
 			if (Iter1&0x1) ((signed char *)(ColBuff+F*Fx))[Iter1-1]=0;
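
From here KerPar_MM_Conv1D_HWC_SQ8 builds two im2col columns per pass (ColBuff and ColBuff1), so each filter vector loaded in the inner product serves two output positions; an odd final column takes the single-buffer path below. The 4*Tail byte offset places the second buffer exactly one aligned column past the first, since Tail counts 32-bit words:

	int Tail = 2 * ((W_In1 + 7) / 8);        /* column size in 32-bit words  */
	/* 4*Tail bytes == 8*((W_In1+7)/8) == Align(W_In1, 8) bytes              */
	signed char *ColBuff1 = ColBuff + 4 * Tail;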
@@ -919,134 +1038,61 @@ void KerPar_MM_Conv1D_HWC_SQ8(
 			}
 			PosC += Sx;
 			gap_waitbarrier(0);
-	                for (int Line=First; Line<Last; Line++) {
-	                        v4s *VIn1 = (v4s *) (&Filter[Line*W_In1 + 0]);
-	                        int S0 = (Bias[Line]<<NormBias);
-	                        for (int i=0; i<((W_In1+7)/8); i++) {
-	                                v4s V0 = VIn1[2*i], V1 = VIn1[2*i+1];
-					v4s C0 = VBuff[2*i], C1 = VBuff[2*i+1];
+
+			int *pBias = Bias + First;
+			signed char *pC = Filter + W_In1*First;
+			signed char *pOut0 = Out + (c)*OutFeat+First;
+			unsigned char *pSc = Scale + First;
+			unsigned char *pScN = ScaleN + First;
+	                for (int Line=0; Line<IterOut/4; Line++) {
+				signed char *pC0 = pC, *pC1 = pC0+W_In1, *pC2 = pC1+W_In1, *pC3 = pC2+W_In1;
+				pC=pC3+W_In1;
+	                        int S0 = (*pBias)<<NormBias; pBias++;
+	                        int S1 = (*pBias)<<NormBias; pBias++;
+	                        int S2 = (*pBias)<<NormBias; pBias++;
+	                        int S3 = (*pBias)<<NormBias; pBias++;
+				signed char *pIn = ColBuff;
+	                        for (int i=0; i<(W_In1/4); i++) {
+					v4s V0 = *((v4s *)pIn), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
 	                                S0 = gap_sumdotp4(V0, C0, S0);
-	                                S0 = gap_sumdotp4(V1, C1, S0);
+	                                S1 = gap_sumdotp4(V0, C1, S1);
+	                                S2 = gap_sumdotp4(V0, C2, S2);
+	                                S3 = gap_sumdotp4(V0, C3, S3);
+					pIn+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
 	                        }
-	                        unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-	                        Out[l*Wo*OutFeat + c*OutFeat + Line] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
+				for (int f=4*(W_In1/4); f<W_In1; f++) {
+					int V0 = *pIn, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
+					S0 += V0*C0;
+					S1 += V0*C1;
+					S2 += V0*C2;
+					S3 += V0*C3;
+					pIn++; pC0++; pC1++; pC2++; pC3++;
+				}
+				v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
+						   gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
+				*((v4s *) (pOut0+4*Line)) = R1;
 	                }
+			for (int i=4*(IterOut/4); i<IterOut; i++) {
+				signed char *pIn = ColBuff;
+	                        int S0 = (*pBias)<<NormBias; pBias++;
+	                        for (int i=0; i<(W_In1/4); i++) {
+					v4s V0 = *((v4s *)pIn), C0 = *((v4s *)pC);
+	                               	S0 = gap_sumdotp4(V0, C0, S0);
+					pIn+=4; pC+=4;
+				}
+				for (int f=4*(W_In1/4); f<W_In1; f++) {
+					int V0 = *pIn, C0 = *pC;
+					S0 += V0*C0;
+					pIn++; pC++;
+				}
+				*(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7);
+			}
 			gap_waitbarrier(0);
 		}
-		PosL += Sy;
-	}
+	// 	PosL += Sy;
+	// }
 }
 
-// void KerPar_MM_Conv1D_HWC_SQ8(
-// 	Ker_MM_Conv_SQ8_T *Arg
-// 	)
-
-// {
-// 	/*
-// 		For HWC weights (4D Tensor) are expected to be organized as [OutFeat x Fy x Fx x InFeat]
-// 	*/
-// 	signed char *__restrict__ In = Arg->In;
-// 	int W = Arg->W, H = Arg->H;
-// 	signed char *__restrict__ Filter = Arg->Filter;
-// 	int Fx = Arg->Fx, Sx = Arg->Sx, Sy = Arg->Sy;
-// 	int PadL = Arg->Pad[0];
-// 	int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
-
-//         int * __restrict__ Bias = Arg->Bias;
-// 	int NormBias = Arg->Infos[AT_INF_BIASN];
-//         signed char * __restrict__ Out = Arg->Out;
-//         unsigned char * __restrict__ Scale = Arg->Scale;
-//         unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-
-// 	int Wo = Arg->Wo, Ho = Arg->Ho;
-
-// 	/* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */
-// 	unsigned int W_In1 = InFeat*Fx;
-// 	unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C);
-// 	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
-//         unsigned int Iter = (Last>First)?(Last-First):0;
-
-// 	int PosL = 0;
-// 	int PosC = -PadL;
-// 	for (int c=0; c<Wo/4; c++) {
-// 		v4s * __restrict__ VIn1 = (v4s *) (In + (4*c  )*Sx*InFeat);
-// 		v4s * __restrict__ VIn2 = (v4s *) (In + (4*c+1)*Sx*InFeat);
-// 		v4s * __restrict__ VIn3 = (v4s *) (In + (4*c+2)*Sx*InFeat);
-// 		v4s * __restrict__ VIn4 = (v4s *) (In + (4*c+3)*Sx*InFeat);
-//                 for (int c_out=0; c_out<Iter/2; c_out++) {
-//                 	int col = First + 2*c_out;
-//                         v4s *VFilter1 = (v4s *) (&Filter[(col  )*W_In1]);
-//                         v4s *VFilter2 = (v4s *) (&Filter[(col+1)*W_In1]);
-//                         int S1 = (Bias[col  ]<<NormBias), S3=S1, S5=S1, S7=S1;
-//                         int S2 = (Bias[col+1]<<NormBias), S4=S2, S6=S2, S8=S2;
-//                         for (int i=0; i<(W_In1/4); i++) {
-//                                 S1 = gap_sumdotp4(VIn1[i], VFilter1[i], S1); S2 = gap_sumdotp4(VIn1[i], VFilter2[i], S2);
-//                                 S3 = gap_sumdotp4(VIn2[i], VFilter1[i], S3); S4 = gap_sumdotp4(VIn2[i], VFilter2[i], S4);
-//                                 S5 = gap_sumdotp4(VIn3[i], VFilter1[i], S5); S6 = gap_sumdotp4(VIn3[i], VFilter2[i], S6);
-//                                 S7 = gap_sumdotp4(VIn4[i], VFilter1[i], S7); S8 = gap_sumdotp4(VIn4[i], VFilter2[i], S8);
-//                         }
-//                         for (int i=(W_In1/4)*4; i<W_In1; i++) {
-//                         	S1 += In[(4*c  )*Sx*InFeat+i]*Filter[(col)*W_In1+i]; S2 += In[(4*c  )*Sx*InFeat+i]*Filter[(col+1)*W_In1+i];
-//                         	S3 += In[(4*c+1)*Sx*InFeat+i]*Filter[(col)*W_In1+i]; S4 += In[(4*c+1)*Sx*InFeat+i]*Filter[(col+1)*W_In1+i];
-//                         	S5 += In[(4*c+2)*Sx*InFeat+i]*Filter[(col)*W_In1+i]; S6 += In[(4*c+2)*Sx*InFeat+i]*Filter[(col+1)*W_In1+i];
-//                         	S7 += In[(4*c+3)*Sx*InFeat+i]*Filter[(col)*W_In1+i]; S8 += In[(4*c+3)*Sx*InFeat+i]*Filter[(col+1)*W_In1+i];
-//                         }
-//                         unsigned int Sc1 = Scale[col  ], ScN1 = ScaleN[col  ];
-//                         unsigned int Sc2 = Scale[col+1], ScN2 = ScaleN[col+1];
-// 			v2s R1 = gap_pack2(gap_clip(AT_SCALE(S1, Sc1, ScN1), 7), gap_clip(AT_SCALE(S2, Sc2, ScN2), 7));
-// 			v2s R2 = gap_pack2(gap_clip(AT_SCALE(S3, Sc1, ScN1), 7), gap_clip(AT_SCALE(S4, Sc2, ScN2), 7));
-// 			v2s R3 = gap_pack2(gap_clip(AT_SCALE(S5, Sc1, ScN1), 7), gap_clip(AT_SCALE(S6, Sc2, ScN2), 7));
-// 			v2s R4 = gap_pack2(gap_clip(AT_SCALE(S7, Sc1, ScN1), 7), gap_clip(AT_SCALE(S8, Sc2, ScN2), 7));
-//                         *((v2s *) (&Out[(4*c  )*OutFeat + (col)])) = R1;
-//                         *((v2s *) (&Out[(4*c+1)*OutFeat + (col)])) = R2;
-//                         *((v2s *) (&Out[(4*c+2)*OutFeat + (col)])) = R3;
-//                         *((v2s *) (&Out[(4*c+3)*OutFeat + (col)])) = R4;
-//                 }
-//                 if (Iter&0x1) {
-//                 	int col = Last-1;
-//                         v4s *VFilter1 = (v4s *) (&Filter[(col)*W_In1]);
-//                         int S1 = (Bias[col  ]<<NormBias), S3=S1, S5=S1, S7=S1;
-//                         for (int i=0; i<(W_In1/4); i++) {
-//                                 S1 = gap_sumdotp4(VIn1[i], VFilter1[i], S1);
-//                                 S3 = gap_sumdotp4(VIn2[i], VFilter1[i], S3);
-//                                 S5 = gap_sumdotp4(VIn3[i], VFilter1[i], S5);
-//                                 S7 = gap_sumdotp4(VIn4[i], VFilter1[i], S7);
-//                         }
-//                         for (int i=(W_In1/4)*4; i<W_In1; i++) {
-//                         	S1 += In[(4*c  )*Sx*InFeat+i]*Filter[(col)*W_In1+i];
-//                         	S3 += In[(4*c+1)*Sx*InFeat+i]*Filter[(col)*W_In1+i];
-//                         	S5 += In[(4*c+2)*Sx*InFeat+i]*Filter[(col)*W_In1+i];
-//                         	S7 += In[(4*c+3)*Sx*InFeat+i]*Filter[(col)*W_In1+i];
-//                         }
-//                         unsigned int Sc1 = Scale[col  ], ScN1 = ScaleN[col  ];
-// 			Out[(4*c  )*OutFeat + (col)] = gap_clip(AT_SCALE(S1, Sc1, ScN1), 7);
-// 			Out[(4*c+1)*OutFeat + (col)] = gap_clip(AT_SCALE(S3, Sc1, ScN1), 7);
-// 			Out[(4*c+2)*OutFeat + (col)] = gap_clip(AT_SCALE(S5, Sc1, ScN1), 7);
-// 			Out[(4*c+3)*OutFeat + (col)] = gap_clip(AT_SCALE(S7, Sc1, ScN1), 7);
-
-//                 }
-// 		gap_waitbarrier(0);
-// 	}
-// 	for (int c=(Wo/4)*4; c<Wo; c++) {
-// 		v4s * __restrict__ VBuff1 = (v4s *) (In + (c)*Sx*InFeat);
-//                 for (int c_out=First; c_out<Last; c_out++) {
-//                         v4s *VIn1 = (v4s *) (&Filter[c_out*W_In1]);
-//                         int S1 = (Bias[c_out]<<NormBias);
-//                         for (int i=0; i<(W_In1/4); i++) {
-//                                 v4s V1 = VIn1[i];
-// 				v4s C11 = VBuff1[i];
-//                                 S1 = gap_sumdotp4(V1, C11, S1);
-//                         }
-//                         for (int i=(W_In1/4)*4; i<W_In1; i++) {
-//                         	S1 += In[(c)*Sx*InFeat+i]*Filter[c_out*W_In1+i];
-//                         }
-//                         unsigned int Sc = Scale[c_out], ScN = ScaleN[c_out];
-//                         Out[(c)*OutFeat + c_out] = gap_clip(AT_SCALE(S1, Sc, ScN), 7);
-//                 }
-// 		gap_waitbarrier(0);
-// 	}
-// }
-
 void KerPar_MM_Conv1D_DxDy_SQ8(
 	Ker_MM_Conv_SQ8_T *Arg
 	)
@@ -1150,7 +1196,11 @@ void KerPar_MM_Conv1D_DxDy_HWC_SQ8(
 	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
 
 	int Tail = 2*((W_In1+7)/8);
+	signed char * __restrict__ ColBuff1 = ColBuff + 4*Tail;
 	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
+	((int *)ColBuff1)[Tail-1] = 0; ((int *)ColBuff1)[Tail-2] = 0;
+
+	int IterOut = Max(0, Last - First);
 	
 	int DFx = Dx*(Fx-1)+1;
 	// int Prec=10;
@@ -1159,53 +1209,210 @@ void KerPar_MM_Conv1D_DxDy_HWC_SQ8(
 	int PosL = 0;
 	int Iter = L-F;
 	int Iter1 = Iter*Fx;
-	for (int l=0; l<Ho; l++) {
-		int PosC = -PadL;
-		for (int c=0; c<Wo; c++) {
-			for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*Fx))[i]=0;
-			if (Iter1&0x2) ((short int *)(ColBuff+F*Fx))[Iter1/2-1]=0;
-			if (Iter1&0x1) ((signed char *)(ColBuff+F*Fx))[Iter1-1]=0;
-			int Lb = Max(PosC, 0), Rb = Min(PosC+DFx, W);
-			int OffBuffX = Max(0, gap_mulsN(-PosC+Dx-1, InvDx, Prec));
-			int OffInX = OffBuffX?(Dx*OffBuffX+PosC):0;
-			int IterX = gap_mulsN(Rb-Lb-1, InvDx, Prec) + 1;
-			if (Iter>=4) {
-				for (int f=0; f<(Iter/4); f++)
-					for (int i=0; i<IterX; i++)
-						((int *)(ColBuff+(i+OffBuffX)*InFeat+F))[f] = ((int *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[f];
-				if (Iter&0x2)
-					for (int i=0; i<IterX; i++)
-						((short int *)(ColBuff+(i+OffBuffX)*InFeat+F))[Iter/2-1] = ((short int *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[Iter/2-1];
-				if (Iter&0x1)
-					for (int i=0; i<IterX; i++)
-						((signed char *)(ColBuff+(i+OffBuffX)*InFeat+F))[Iter-1] = ((signed char *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[Iter-1];
-			} else if (Iter>=2) {
-				if (Iter&0x2)
-					for (int i=0; i<IterX; i++)
-						((short int *)(ColBuff+(i+OffBuffX)*InFeat+F))[Iter/2-1] = ((short int *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[Iter/2-1];
-				if (Iter&0x1)
-					for (int i=0; i<IterX; i++)
-						((signed char *)(ColBuff+(i+OffBuffX)*InFeat+F))[Iter-1] = ((signed char *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[Iter-1];
-			} else if (Iter>0)
+	int PosC = -PadL;
+	for (int c=0; c<(Wo/2); c++) {
+		for (int i=0; i<(Iter1/4); i++) {
+			((int *)(ColBuff+F*Fx))[i]=0;
+			((int *)(ColBuff1+F*Fx))[i]=0;
+		}
+		if (Iter1&0x2) {
+			((short int *)(ColBuff+F*Fx))[Iter1/2-1]=0;
+			((short int *)(ColBuff1+F*Fx))[Iter1/2-1]=0;
+		}
+		if (Iter1&0x1) {
+			((signed char *)(ColBuff+F*Fx))[Iter1-1]=0;
+			((signed char *)(ColBuff1+F*Fx))[Iter1-1]=0;
+		}
+		int Lb = Max(PosC, 0), Rb = Min(PosC+DFx, W);
+		int OffBuffX = Max(0, gap_mulsN(-PosC+Dx-1, InvDx, Prec));
+		int OffInX = OffBuffX?(Dx*OffBuffX+PosC):0;
+		int IterX = gap_mulsN(Rb-Lb-1, InvDx, Prec) + 1;
+
+		int Lb1 = Max(PosC+Sx, 0), Rb1 = Min(PosC+Sx+DFx, W);
+		int OffBuffX1 = Max(0, gap_mulsN(-PosC-Sx+Dx-1, InvDx, Prec));
+		int OffInX1 = OffBuffX1?(Dx*OffBuffX1+PosC+Sx):0;
+		int IterX1 = gap_mulsN(Rb1-Lb1-1, InvDx, Prec) + 1;
+
+		if (Iter>=4) {
+			for (int f=0; f<(Iter/4); f++) {
+				for (int i=0; i<IterX; i++) 
+					((int *)(ColBuff+(i+OffBuffX)*InFeat+F))[f] = ((int *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[f];
+				for (int i=0; i<IterX1; i++) 
+					((int *)(ColBuff1+(i+OffBuffX1)*InFeat+F))[f] = ((int *)(In+PosL*W*InFeat + (i*Dx+OffInX1+Lb1)*InFeat+F))[f];
+			}
+			if (Iter&0x2) {
+				for (int i=0; i<IterX; i++)
+					((short int *)(ColBuff+(i+OffBuffX)*InFeat+F))[Iter/2-1] = ((short int *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[Iter/2-1];
+				for (int i=0; i<IterX1; i++)
+					((short int *)(ColBuff1+(i+OffBuffX1)*InFeat+F))[Iter/2-1] = ((short int *)(In+PosL*W*InFeat + (i*Dx+OffInX1+Lb1)*InFeat+F))[Iter/2-1];
+			}
+			if (Iter&0x1) {
+				for (int i=0; i<IterX; i++)
+					((signed char *)(ColBuff+(i+OffBuffX)*InFeat+F))[Iter-1] = ((signed char *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[Iter-1];
+				for (int i=0; i<IterX1; i++)
+					((signed char *)(ColBuff1+(i+OffBuffX1)*InFeat+F))[Iter-1] = ((signed char *)(In+PosL*W*InFeat + (i*Dx+OffInX1+Lb1)*InFeat+F))[Iter-1];
+			}
+		} else if (Iter>=2) {
+			if (Iter&0x2) {
+				for (int i=0; i<IterX; i++)
+					((short int *)(ColBuff+(i+OffBuffX)*InFeat+F))[Iter/2-1] = ((short int *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[Iter/2-1];
+				for (int i=0; i<IterX1; i++)
+					((short int *)(ColBuff1+(i+OffBuffX1)*InFeat+F))[Iter/2-1] = ((short int *)(In+PosL*W*InFeat + (i*Dx+OffInX1+Lb1)*InFeat+F))[Iter/2-1];
+			}
+			if (Iter&0x1){
+				for (int i=0; i<IterX; i++)
+					((signed char *)(ColBuff+(i+OffBuffX)*InFeat+F))[Iter-1] = ((signed char *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[Iter-1];
+				for (int i=0; i<IterX1; i++)
+					((signed char *)(ColBuff1+(i+OffBuffX1)*InFeat+F))[Iter-1] = ((signed char *)(In+PosL*W*InFeat + (i*Dx+OffInX1+Lb1)*InFeat+F))[Iter-1];
+			}
+		} else if (Iter>0) {
+			for (int i=0; i<IterX; i++)
+				ColBuff[(i+OffBuffX)*InFeat + F] = In[PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat + F];
+			for (int i=0; i<IterX1; i++)
+				ColBuff1[(i+OffBuffX1)*InFeat + F] = In[PosL*W*InFeat + (i*Dx+OffInX1+Lb1)*InFeat + F];
+		}
+		PosC += 2*Sx;
+		gap_waitbarrier(0);
+
+
+		int *pBias = Bias + First;
+		signed char *pC = Filter + W_In1*First;
+		signed char *pOut0 = Out + (2*c+0)*OutFeat+First;
+		signed char *pOut1 = Out + (2*c+1)*OutFeat+First;
+		unsigned char *pSc = Scale + First;
+		unsigned char *pScN = ScaleN + First;
+                for (int Line=0; Line<IterOut/4; Line++) {
+			signed char *pC0 = pC, *pC1 = pC0+W_In1, *pC2 = pC1+W_In1, *pC3 = pC2+W_In1;
+			pC=pC3+W_In1;
+                        int S0 = (*pBias)<<NormBias, S4=S0; pBias++;
+                        int S1 = (*pBias)<<NormBias, S5=S1; pBias++;
+                        int S2 = (*pBias)<<NormBias, S6=S2; pBias++;
+                        int S3 = (*pBias)<<NormBias, S7=S3; pBias++;
+			signed char *pIn = ColBuff;
+			signed char *pIn1 = ColBuff1;
+                        for (int i=0; i<(W_In1/4); i++) {
+				v4s V0 = *((v4s *)pIn), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
+                                S0 = gap_sumdotp4(V0, C0, S0); S4 = gap_sumdotp4(V1, C0, S4); 
+                                S1 = gap_sumdotp4(V0, C1, S1); S5 = gap_sumdotp4(V1, C1, S5); 
+                                S2 = gap_sumdotp4(V0, C2, S2); S6 = gap_sumdotp4(V1, C2, S6); 
+                                S3 = gap_sumdotp4(V0, C3, S3); S7 = gap_sumdotp4(V1, C3, S7); 
+				pIn+=4; pIn1+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
+                        }
+			for (int f=4*(W_In1/4); f<W_In1; f++) {
+				int V0 = *pIn, V1 = *pIn1, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
+				S0 += V0*C0; S4 += V1*C0;
+				S1 += V0*C1; S5 += V1*C1;
+				S2 += V0*C2; S6 += V1*C2;
+				S3 += V0*C3; S7 += V1*C3;
+				pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++;
+			}
+			v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
+					   gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
+			v4s R2 = gap_pack4(gap_clip(AT_SCALE(S4, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]), 7),
+					   gap_clip(AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]), 7));
+			*((v4s *) (pOut0+4*Line)) = R1;
+			*((v4s *) (pOut1+4*Line)) = R2;
+                }
+		for (int i=4*(IterOut/4); i<IterOut; i++) {
+			signed char *pIn = ColBuff;
+			signed char *pIn1 = ColBuff1;
+                        int S0 = (*pBias)<<NormBias, S4=S0; pBias++;
+                        for (int i=0; i<(W_In1/4); i++) {
+				v4s V0 = *((v4s *)pIn), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC);
+                               	S0 = gap_sumdotp4(V0, C0, S0); S4 = gap_sumdotp4(V1, C0, S4);
+				pIn+=4; pIn1+=4; pC+=4;
+			}
+			for (int f=4*(W_In1/4); f<W_In1; f++) {
+				int V0 = *pIn, V1 = *pIn1, C0 = *pC;
+				S0 += V0*C0; S4 += V1*C0;
+				pIn++; pIn1++; pC++;
+			}
+			*(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7);
+			*(pOut1+i) = gap_clip(AT_SCALE(S4, pSc[i], pScN[i]), 7);
+		}
+		gap_waitbarrier(0);
+	}
+	if (Wo&0x1) {
+		int c = Wo-1;
+		for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*Fx))[i]=0;
+		if (Iter1&0x2) ((short int *)(ColBuff+F*Fx))[Iter1/2-1]=0;
+		if (Iter1&0x1) ((signed char *)(ColBuff+F*Fx))[Iter1-1]=0;
+		int Lb = Max(PosC, 0), Rb = Min(PosC+DFx, W);
+		int OffBuffX = Max(0, gap_mulsN(-PosC+Dx-1, InvDx, Prec));
+		int OffInX = OffBuffX?(Dx*OffBuffX+PosC):0;
+		int IterX = gap_mulsN(Rb-Lb-1, InvDx, Prec) + 1;
+		if (Iter>=4) {
+			for (int f=0; f<(Iter/4); f++)
+				for (int i=0; i<IterX; i++)
+					((int *)(ColBuff+(i+OffBuffX)*InFeat+F))[f] = ((int *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[f];
+			if (Iter&0x2)
 				for (int i=0; i<IterX; i++)
-					ColBuff[(i+OffBuffX)*InFeat + F] = In[PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat + F];
-			PosC += Sx;
-			gap_waitbarrier(0);
-	                for (int Line=First; Line<Last; Line++) {
-	                        v4s *VIn1 = (v4s *) (&Filter[Line*W_In1 + 0]);
-	                        int S0 = (Bias[Line]<<NormBias);
-	                        for (int i=0; i<((W_In1+7)/8); i++) {
-	                                v4s V0 = VIn1[2*i], V1 = VIn1[2*i+1];
-					v4s C0 = VBuff[2*i], C1 = VBuff[2*i+1];
-	                                S0 = gap_sumdotp4(V0, C0, S0);
-	                                S0 = gap_sumdotp4(V1, C1, S0);
-	                        }
-	                        unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-	                        Out[l*Wo*OutFeat + c*OutFeat + Line] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-	                }
-			gap_waitbarrier(0);
+					((short int *)(ColBuff+(i+OffBuffX)*InFeat+F))[Iter/2-1] = ((short int *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[Iter/2-1];
+			if (Iter&0x1)
+				for (int i=0; i<IterX; i++)
+					((signed char *)(ColBuff+(i+OffBuffX)*InFeat+F))[Iter-1] = ((signed char *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[Iter-1];
+		} else if (Iter>=2) {
+			if (Iter&0x2)
+				for (int i=0; i<IterX; i++)
+					((short int *)(ColBuff+(i+OffBuffX)*InFeat+F))[Iter/2-1] = ((short int *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[Iter/2-1];
+			if (Iter&0x1)
+				for (int i=0; i<IterX; i++)
+					((signed char *)(ColBuff+(i+OffBuffX)*InFeat+F))[Iter-1] = ((signed char *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[Iter-1];
+		} else if (Iter>0)
+			for (int i=0; i<IterX; i++)
+				ColBuff[(i+OffBuffX)*InFeat + F] = In[PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat + F];
+		PosC += Sx;
+		gap_waitbarrier(0);
+
+		int *pBias = Bias + First;
+		signed char *pC = Filter + W_In1*First;
+		signed char *pOut0 = Out + (c)*OutFeat+First;
+		unsigned char *pSc = Scale + First;
+		unsigned char *pScN = ScaleN + First;
+                for (int Line=0; Line<IterOut/4; Line++) {
+			signed char *pC0 = pC, *pC1 = pC0+W_In1, *pC2 = pC1+W_In1, *pC3 = pC2+W_In1;
+			pC=pC3+W_In1;
+                        int S0 = (*pBias)<<NormBias; pBias++;
+                        int S1 = (*pBias)<<NormBias; pBias++;
+                        int S2 = (*pBias)<<NormBias; pBias++;
+                        int S3 = (*pBias)<<NormBias; pBias++;
+			signed char *pIn = ColBuff;
+                        for (int i=0; i<(W_In1/4); i++) {
+				v4s V0 = *((v4s *)pIn), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
+                                S0 = gap_sumdotp4(V0, C0, S0);
+                                S1 = gap_sumdotp4(V0, C1, S1);
+                                S2 = gap_sumdotp4(V0, C2, S2);
+                                S3 = gap_sumdotp4(V0, C3, S3);
+				pIn+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
+                        }
+			for (int f=4*(W_In1/4); f<W_In1; f++) {
+				int V0 = *pIn, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
+				S0 += V0*C0;
+				S1 += V0*C1;
+				S2 += V0*C2;
+				S3 += V0*C3;
+				pIn++; pC0++; pC1++; pC2++; pC3++;
+			}
+			v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
+					   gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
+			*((v4s *) (pOut0+4*Line)) = R1;
+                }
+		for (int i=4*(IterOut/4); i<IterOut; i++) {
+			signed char *pIn = ColBuff;
+                        int S0 = (*pBias)<<NormBias; pBias++;
+                        for (int i=0; i<(W_In1/4); i++) {
+				v4s V0 = *((v4s *)pIn), C0 = *((v4s *)pC);
+                               	S0 = gap_sumdotp4(V0, C0, S0);
+				pIn+=4; pC+=4;
+			}
+			for (int f=4*(W_In1/4); f<W_In1; f++) {
+				int V0 = *pIn, C0 = *pC;
+				S0 += V0*C0;
+				pIn++; pC++;
+			}
+			*(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7);
 		}
-		PosL += Sy;
+		gap_waitbarrier(0);
 	}
 }
 
@@ -1733,7 +1940,10 @@ void KerPar_MM_Conv2D_HWC_SQ8(
 
 	int FS = Fx*Fy;
 	int Tail = 2*((W_In1+7)/8);
+
+	signed char * __restrict__ ColBuff1 = ColBuff + 4*Tail;
 	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
+	((int *)ColBuff1)[Tail-1] = 0; ((int *)ColBuff1)[Tail-2] = 0;
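+	/* ColBuff1 is a second im2col buffer so two output columns can be built and
+	   consumed per pass; the last 8 bytes of each buffer are cleared so any
+	   vectorized access rounding W_In1 up reads zeros past the end. */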
 	int PosL = Arg->FirstTile?(-PadT):0;
 
 	int Iter = L-F;
@@ -1743,7 +1953,133 @@ void KerPar_MM_Conv2D_HWC_SQ8(
 		int PosC = -PadL;
 		int Tb = Max(PosL, 0), Db = Min(PosL+Fy, H);
 		int OffL = -Tb - Min(PosL, 0);
-		for (int c=0; c<Wo; c++) {
+		for (int c=0; c<(Wo/2); c++) {
+			for (int i=0; i<(Iter1/4); i++) {
+				((int *)(ColBuff+F*FS))[i]=0;
+				((int *)(ColBuff1+F*FS))[i]=0;
+			}
+			if (Iter1&0x2) {
+				((short int *)(ColBuff+F*FS))[Iter1/2-1]=0;
+				((short int *)(ColBuff1+F*FS))[Iter1/2-1]=0;
+			}
+			if (Iter1&0x1) {
+				((signed char *)(ColBuff+F*FS))[Iter1-1]=0;
+				((signed char *)(ColBuff1+F*FS))[Iter1-1]=0;
+			}
+			int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W);
+			int Lb1 = Max(PosC+Sx, 0), Rb1 = Min(PosC+Sx+Fx, W);
+			int OffC = -Lb - Min(PosC, 0);
+			int OffC1 = -Lb1 - Min(PosC+Sx, 0);
+                        if (Iter>=4) {
+                                for (int f=0; f<(Iter/4); f++)
+					for (int j=Tb; j<Db; j++) {
+                                        	for (int i=Lb; i<Rb; i++) {
+							((int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[f] = ((int *)(In+j*W*InFeat + i*InFeat+F))[f];
+                                        	}
+                                        	for (int i=Lb1; i<Rb1; i++) {
+							((int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[f] = ((int *)(In+j*W*InFeat + i*InFeat+F))[f];
+                                        	}
+					}
+                                if (Iter&0x2)
+					for (int j=Tb; j<Db; j++) {
+						for (int i=Lb; i<Rb; i++)
+							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
+						for (int i=Lb1; i<Rb1; i++)
+							((short int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
+					}
+                                if (Iter&0x1)
+					for (int j=Tb; j<Db; j++) {
+						for (int i=Lb; i<Rb; i++)
+							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
+						for (int i=Lb1; i<Rb1; i++)
+							((signed char *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
+					}
+                        } else if (Iter>=2) {
+                                if (Iter&0x2)
+					for (int j=Tb; j<Db; j++) {
+						for (int i=Lb; i<Rb; i++)
+							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
+						for (int i=Lb1; i<Rb1; i++)
+							((short int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
+					}
+                                if (Iter&0x1)
+					for (int j=Tb; j<Db; j++) {
+						for (int i=Lb; i<Rb; i++)
+							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
+						for (int i=Lb1; i<Rb1; i++)
+							((signed char *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
+					}
+                        } else if (Iter>0) {
+				for (int j=Tb; j<Db; j++) {
+					for (int i=Lb; i<Rb; i++)
+						ColBuff[(j+OffL)*InFeat*Fx+(i+OffC)*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
+					for (int i=Lb1; i<Rb1; i++)
+						ColBuff1[(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
+				}
+			}
+			PosC += 2*Sx;
+			gap_waitbarrier(0);
+
+			int *pBias = Bias + First;
+			signed char *pC = Filter + W_In1*First;
+			signed char *pOut0 = Out+l*Wo*OutFeat + (2*c+0)*OutFeat+First;
+			signed char *pOut1 = Out+l*Wo*OutFeat + (2*c+1)*OutFeat+First;
+			unsigned char *pSc = Scale + First;
+			unsigned char *pScN = ScaleN + First;
+	                for (int Line=0; Line<IterOut/4; Line++) {
+				signed char *pC0 = pC, *pC1 = pC0+W_In1, *pC2 = pC1+W_In1, *pC3 = pC2+W_In1;
+				pC=pC3+W_In1;
+	                        int S0 = (pBias[4*Line  ])<<NormBias, S4=S0;
+	                        int S1 = (pBias[4*Line+1])<<NormBias, S5=S1;
+	                        int S2 = (pBias[4*Line+2])<<NormBias, S6=S2;
+	                        int S3 = (pBias[4*Line+3])<<NormBias, S7=S3;
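+				/* S0..S3 accumulate output column 2c, S4..S7 column 2c+1;
+				   both reuse the same four filter rows pC0..pC3 */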
+				signed char *pIn = ColBuff;
+				signed char *pIn1 = ColBuff1;
+	                        for (int i=0; i<(W_In1/4); i++) {
+					v4s V0 = *((v4s *)pIn), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
+	                                S0 = gap_sumdotp4(V0, C0, S0); S4 = gap_sumdotp4(V1, C0, S4); 
+	                                S1 = gap_sumdotp4(V0, C1, S1); S5 = gap_sumdotp4(V1, C1, S5); 
+	                                S2 = gap_sumdotp4(V0, C2, S2); S6 = gap_sumdotp4(V1, C2, S6); 
+	                                S3 = gap_sumdotp4(V0, C3, S3); S7 = gap_sumdotp4(V1, C3, S7); 
+					pIn+=4; pIn1+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
+	                        }
+				for (int f=4*(W_In1/4); f<W_In1; f++) {
+					int V0 = *pIn, V1 = *pIn1, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
+					S0 += V0*C0; S4 += V1*C0;
+					S1 += V0*C1; S5 += V1*C1;
+					S2 += V0*C2; S6 += V1*C2;
+					S3 += V0*C3; S7 += V1*C3;
+					pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++;
+				}
+				v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
+						   gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
+				v4s R2 = gap_pack4(gap_clip(AT_SCALE(S4, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]), 7),
+						   gap_clip(AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]), 7));
+				*((v4s *) (pOut0+4*Line)) = R1;
+				*((v4s *) (pOut1+4*Line)) = R2;
+	                }
+			for (int i=4*(IterOut/4); i<IterOut; i++) {
+				signed char *pIn = ColBuff;
+				signed char *pIn1 = ColBuff1;
+	                        int S0 = (pBias[i])<<NormBias, S4=S0;
+	                        for (int i=0; i<(W_In1/4); i++) {
+					v4s V0 = *((v4s *)pIn), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC);
+	                               	S0 = gap_sumdotp4(V0, C0, S0); S4 = gap_sumdotp4(V1, C0, S4);
+					pIn+=4; pIn1+=4; pC+=4;
+				}
+				for (int f=4*(W_In1/4); f<W_In1; f++) {
+					int V0 = *pIn, V1 = *pIn1, C0 = *pC;
+					S0 += V0*C0; S4 += V1*C0;
+					pIn++; pIn1++; pC++;
+				}
+				*(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7);
+				*(pOut1+i) = gap_clip(AT_SCALE(S4, pSc[i], pScN[i]), 7);
+			}
+			gap_waitbarrier(0);
+		}
+		if (Wo&0x1) { /* Epilogue: handle the last output column when Wo is odd */
+			int c = Wo-1;
 			for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*FS))[i]=0;
 			if (Iter1&0x2) ((short int *)(ColBuff+F*FS))[Iter1/2-1]=0;
 			if (Iter1&0x1) ((signed char *)(ColBuff+F*FS))[Iter1-1]=0;
@@ -1772,16 +2108,16 @@ void KerPar_MM_Conv2D_HWC_SQ8(
 						for (int i=Lb; i<Rb; i++)
 							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
                         } else if (Iter>0)
-				for (int j=Tb; j<Db; j++) 
+				for (int j=Tb; j<Db; j++)
 					for (int i=Lb; i<Rb; i++)
 						ColBuff[(j+OffL)*InFeat*Fx+(i+OffC)*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
 
 			PosC += Sx;
 			gap_waitbarrier(0);
-	                // for (int Line=First; Line<Last; Line++) {
+
 			int *pBias = Bias + First;
 			signed char *pC = Filter + W_In1*First;
-			signed char *pOut0 = Out+l*Wo*OutFeat + (c+0)*OutFeat+First;
+			signed char *pOut0 = Out+l*Wo*OutFeat + (c)*OutFeat+First;
 			unsigned char *pSc = Scale + First;
 			unsigned char *pScN = ScaleN + First;
 	                for (int Line=0; Line<IterOut/4; Line++) {
@@ -1808,15 +2144,9 @@ void KerPar_MM_Conv2D_HWC_SQ8(
 					S3 += V0*C3;
 					pIn++; pC0++; pC1++; pC2++; pC3++;
 				}
-	                        unsigned int Sc, ScN;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S0, Sc, ScN), 7); pOut0++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S1, Sc, ScN), 7); pOut0++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S2, Sc, ScN), 7); pOut0++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S3, Sc, ScN), 7); pOut0++;
+				v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
+						   gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
+				*((v4s *) (pOut0+4*Line)) = R1;
 	                }
 			for (int i=4*(IterOut/4); i<IterOut; i++) {
 				signed char *pIn = ColBuff;
@@ -1831,18 +2161,14 @@ void KerPar_MM_Conv2D_HWC_SQ8(
 					S0 += V0*C0;
 					pIn++; pC++;
 				}
-	                        unsigned int Sc, ScN;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S0, Sc, ScN), 7); pOut0++;
+				*(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7);
 			}
 			gap_waitbarrier(0);
 		}
 		PosL += Sy;
 	}
-	gap_waitbarrier(0);
 }
 
-
 void KerPar_MM_Conv2D_ReLU_HWC_SQ8(
 	Ker_MM_Conv_SQ8_T *Arg
 	)
@@ -1875,7 +2201,10 @@ void KerPar_MM_Conv2D_ReLU_HWC_SQ8(
 
 	int FS = Fx*Fy;
 	int Tail = 2*((W_In1+7)/8);
+
+	signed char * __restrict__ ColBuff1 = ColBuff + 4*Tail;
 	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
+	((int *)ColBuff1)[Tail-1] = 0; ((int *)ColBuff1)[Tail-2] = 0;
 	int PosL = Arg->FirstTile?(-PadT):0;
 
 	int Iter = L-F;
@@ -1885,7 +2214,133 @@ void KerPar_MM_Conv2D_ReLU_HWC_SQ8(
 		int PosC = -PadL;
 		int Tb = Max(PosL, 0), Db = Min(PosL+Fy, H);
 		int OffL = -Tb - Min(PosL, 0);
-		for (int c=0; c<Wo; c++) {
+		for (int c=0; c<(Wo/2); c++) {
+			for (int i=0; i<(Iter1/4); i++) {
+				((int *)(ColBuff+F*FS))[i]=0;
+				((int *)(ColBuff1+F*FS))[i]=0;
+			}
+			if (Iter1&0x2) {
+				((short int *)(ColBuff+F*FS))[Iter1/2-1]=0;
+				((short int *)(ColBuff1+F*FS))[Iter1/2-1]=0;
+			}
+			if (Iter1&0x1) {
+				((signed char *)(ColBuff+F*FS))[Iter1-1]=0;
+				((signed char *)(ColBuff1+F*FS))[Iter1-1]=0;
+			}
+			int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W);
+			int Lb1 = Max(PosC+Sx, 0), Rb1 = Min(PosC+Sx+Fx, W);
+			int OffC = -Lb - Min(PosC, 0);
+			int OffC1 = -Lb1 - Min(PosC+Sx, 0);
+                        if (Iter>=4) {
+                                for (int f=0; f<(Iter/4); f++)
+					for (int j=Tb; j<Db; j++) {
+                                        	for (int i=Lb; i<Rb; i++) {
+							((int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[f] = ((int *)(In+j*W*InFeat + i*InFeat+F))[f];
+                                        	}
+                                        	for (int i=Lb1; i<Rb1; i++) {
+							((int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[f] = ((int *)(In+j*W*InFeat + i*InFeat+F))[f];
+                                        	}
+					}
+                                if (Iter&0x2)
+					for (int j=Tb; j<Db; j++) {
+						for (int i=Lb; i<Rb; i++)
+							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
+						for (int i=Lb1; i<Rb1; i++)
+							((short int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
+					}
+                                if (Iter&0x1)
+					for (int j=Tb; j<Db; j++) {
+						for (int i=Lb; i<Rb; i++)
+							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
+						for (int i=Lb1; i<Rb1; i++)
+							((signed char *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
+					}
+                        } else if (Iter>=2) {
+                                if (Iter&0x2)
+					for (int j=Tb; j<Db; j++) {
+						for (int i=Lb; i<Rb; i++)
+							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
+						for (int i=Lb1; i<Rb1; i++)
+							((short int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
+					}
+                                if (Iter&0x1)
+					for (int j=Tb; j<Db; j++) {
+						for (int i=Lb; i<Rb; i++)
+							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
+						for (int i=Lb1; i<Rb1; i++)
+							((signed char *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
+					}
+                        } else if (Iter>0) {
+				for (int j=Tb; j<Db; j++) {
+					for (int i=Lb; i<Rb; i++)
+						ColBuff[(j+OffL)*InFeat*Fx+(i+OffC)*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
+					for (int i=Lb1; i<Rb1; i++)
+						ColBuff1[(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
+				}
+			}
+			PosC += 2*Sx;
+			gap_waitbarrier(0);
+
+			int *pBias = Bias + First;
+			signed char *pC = Filter + W_In1*First;
+			signed char *pOut0 = Out+l*Wo*OutFeat + (2*c+0)*OutFeat+First;
+			signed char *pOut1 = Out+l*Wo*OutFeat + (2*c+1)*OutFeat+First;
+			unsigned char *pSc = Scale + First;
+			unsigned char *pScN = ScaleN + First;
+	                for (int Line=0; Line<IterOut/4; Line++) {
+				signed char *pC0 = pC, *pC1 = pC0+W_In1, *pC2 = pC1+W_In1, *pC3 = pC2+W_In1;
+				pC=pC3+W_In1;
+	                        int S0 = (pBias[4*Line  ])<<NormBias, S4=S0;
+	                        int S1 = (pBias[4*Line+1])<<NormBias, S5=S1;
+	                        int S2 = (pBias[4*Line+2])<<NormBias, S6=S2;
+	                        int S3 = (pBias[4*Line+3])<<NormBias, S7=S3;
+				signed char *pIn = ColBuff;
+				signed char *pIn1 = ColBuff1;
+	                        for (int i=0; i<(W_In1/4); i++) {
+					v4s V0 = *((v4s *)pIn), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
+	                                S0 = gap_sumdotp4(V0, C0, S0); S4 = gap_sumdotp4(V1, C0, S4); 
+	                                S1 = gap_sumdotp4(V0, C1, S1); S5 = gap_sumdotp4(V1, C1, S5); 
+	                                S2 = gap_sumdotp4(V0, C2, S2); S6 = gap_sumdotp4(V1, C2, S6); 
+	                                S3 = gap_sumdotp4(V0, C3, S3); S7 = gap_sumdotp4(V1, C3, S7); 
+					pIn+=4; pIn1+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
+	                        }
+				for (int f=4*(W_In1/4); f<W_In1; f++) {
+					int V0 = *pIn, V1 = *pIn1, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
+					S0 += V0*C0; S4 += V1*C0;
+					S1 += V0*C1; S5 += V1*C1;
+					S2 += V0*C2; S6 += V1*C2;
+					S3 += V0*C3; S7 += V1*C3;
+					pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++;
+				}
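+				/* AT_CLIP_POS_IMM clamps to [0, 2^7-1]: requantization with the
+				   ReLU fused into the clip */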
+				v4s R1 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
+						   AT_CLIP_POS_IMM(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
+				v4s R2 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S4, pSc[4*Line  ], pScN[4*Line  ]), 7), AT_CLIP_POS_IMM(AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]), 7),
+						   AT_CLIP_POS_IMM(AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]), 7), AT_CLIP_POS_IMM(AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]), 7));
+				*((v4s *) (pOut0+4*Line)) = R1;
+				*((v4s *) (pOut1+4*Line)) = R2;
+	                }
+			for (int i=4*(IterOut/4); i<IterOut; i++) {
+				signed char *pIn = ColBuff;
+				signed char *pIn1 = ColBuff1;
+	                        int S0 = (pBias[i])<<NormBias, S4=S0;
+	                        for (int i=0; i<(W_In1/4); i++) {
+					v4s V0 = *((v4s *)pIn), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC);
+	                               	S0 = gap_sumdotp4(V0, C0, S0); S4 = gap_sumdotp4(V1, C0, S4);
+					pIn+=4; pIn1+=4; pC+=4;
+				}
+				for (int f=4*(W_In1/4); f<W_In1; f++) {
+					int V0 = *pIn, V1 = *pIn1, C0 = *pC;
+					S0 += V0*C0; S4 += V1*C0;
+					pIn++; pIn1++; pC++;
+				}
+				*(pOut0+i) = AT_CLIP_POS_IMM(AT_SCALE(S0, pSc[i], pScN[i]), 7);
+				*(pOut1+i) = AT_CLIP_POS_IMM(AT_SCALE(S4, pSc[i], pScN[i]), 7);
+			}
+			gap_waitbarrier(0);
+		}
+		if (Wo&0x1) { /* Epilogue: handle the last output column when Wo is odd */
+			int c = Wo-1;
 			for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*FS))[i]=0;
 			if (Iter1&0x2) ((short int *)(ColBuff+F*FS))[Iter1/2-1]=0;
 			if (Iter1&0x1) ((signed char *)(ColBuff+F*FS))[Iter1-1]=0;
@@ -1914,15 +2369,16 @@ void KerPar_MM_Conv2D_ReLU_HWC_SQ8(
 						for (int i=Lb; i<Rb; i++)
 							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
                         } else if (Iter>0)
-				for (int j=Tb; j<Db; j++) 
+				for (int j=Tb; j<Db; j++)
 					for (int i=Lb; i<Rb; i++)
 						ColBuff[(j+OffL)*InFeat*Fx+(i+OffC)*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
+
 			PosC += Sx;
 			gap_waitbarrier(0);
-	                // for (int Line=First; Line<Last; Line++) {
+
 			int *pBias = Bias + First;
 			signed char *pC = Filter + W_In1*First;
-			signed char *pOut0 = Out+l*Wo*OutFeat + (c+0)*OutFeat+First;
+			signed char *pOut0 = Out+l*Wo*OutFeat + (c)*OutFeat+First;
 			unsigned char *pSc = Scale + First;
 			unsigned char *pScN = ScaleN + First;
 	                for (int Line=0; Line<IterOut/4; Line++) {
@@ -1949,15 +2405,9 @@ void KerPar_MM_Conv2D_ReLU_HWC_SQ8(
 					S3 += V0*C3;
 					pIn++; pC0++; pC1++; pC2++; pC3++;
 				}
-	                        unsigned int Sc, ScN;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7); pOut0++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7); pOut0++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S2, Sc, ScN), 7); pOut0++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S3, Sc, ScN), 7); pOut0++;
+				v4s R1 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
+						   AT_CLIP_POS_IMM(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
+				*((v4s *) (pOut0+4*Line)) = R1;
 	                }
 			for (int i=4*(IterOut/4); i<IterOut; i++) {
 				signed char *pIn = ColBuff;
@@ -1972,15 +2422,12 @@ void KerPar_MM_Conv2D_ReLU_HWC_SQ8(
 					S0 += V0*C0;
 					pIn++; pC++;
 				}
-	                        unsigned int Sc, ScN;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7); pOut0++;
+				*(pOut0+i) = AT_CLIP_POS_IMM(AT_SCALE(S0, pSc[i], pScN[i]), 7);
 			}
 			gap_waitbarrier(0);
 		}
 		PosL += Sy;
 	}
-	gap_waitbarrier(0);
 }
 
 void Ker_MM_Conv2D_HWC_SQ8(
@@ -2654,7 +3101,9 @@ void KerPar_MM_Conv2D_DxDy_HWC_SQ8(
 
 	int FS = Fx*Fy;
 	int Tail = 2*((W_In1+7)/8);
+	signed char * __restrict__ ColBuff1 = ColBuff + 4*Tail;
 	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
+	((int *)ColBuff1)[Tail-1] = 0; ((int *)ColBuff1)[Tail-2] = 0;
 	int PosL = Arg->FirstTile?(-PadT):0;
 	int DFx = Dx*Fx, DFy = Dy*Fy;
 
@@ -2664,44 +3113,132 @@ void KerPar_MM_Conv2D_DxDy_HWC_SQ8(
 	for (int l=0; l<Ho; l++) {
 		int PosC = -PadL;
 		int Tb = PosL, Db = PosL+DFy;
-		for (int c=0; c<Wo; c++) {
+		for (int c=0; c<(Wo/2); c++) {
+			for (int i=0; i<(Iter1/4); i++) {
+				((int *)(ColBuff+F*FS))[i]=0;
+				((int *)(ColBuff1+F*FS))[i]=0;
+			}
+			if (Iter1&0x2) {
+				((short int *)(ColBuff+F*FS))[Iter1/2-1]=0;
+				((short int *)(ColBuff1+F*FS))[Iter1/2-1]=0;
+			}
+			if (Iter1&0x1) {
+				((signed char *)(ColBuff+F*FS))[Iter1-1]=0;
+				((signed char *)(ColBuff1+F*FS))[Iter1-1]=0;
+			}
+			int Lb = PosC, Rb = PosC+DFx;
+			int Lb1 = PosC+Sx, Rb1 = PosC+Sx+DFx;
+/*
+This path is more efficient but currently produces wrong results; kept disabled until investigated (TOCHECK).
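+Likely cause: the casts below bind before the offset arithmetic, so e.g.
+(int *)ColBuff+Off advances in 4-byte units; the working code casts after
+offsetting, as in ((int *)(ColBuff+Off))[f].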
+			if (Iter>=4) {
+				for (int f=0; f<(Iter/4); f++)
+					for (int j=Tb, jj=0; j<Db; j+=Dy, jj++)
+						for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
+							if (j>=0 && j<H && i>=0 && i<W)
+								((int *)ColBuff+jj*InFeat*Fx+ii*InFeat + F)[f] = ((int *)In+j*W*InFeat + i*InFeat + F)[f];
+				if (Iter&0x2)
+					for (int j=Tb, jj=0; j<Db; j+=Dy, jj++)
+						for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
+							if (j>=0 && j<H && i>=0 && i<W)
+								((short int *)ColBuff+jj*InFeat*Fx+ii*InFeat + F)[Iter/2-1] = ((short int *)In+j*W*InFeat + i*InFeat + F)[Iter/2-1];
+				if (Iter&0x1)
+					for (int j=Tb, jj=0; j<Db; j+=Dy, jj++)
+						for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
+							if (j>=0 && j<H && i>=0 && i<W)
+								((signed char *)ColBuff+jj*InFeat*Fx+ii*InFeat + F)[Iter-1] = ((signed char *)In+j*W*InFeat + i*InFeat + F)[Iter-1];
+			} else if (Iter>=2) {
+				if (Iter&0x2)
+					for (int j=Tb, jj=0; j<Db; j+=Dy, jj++)
+						for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
+							if (j>=0 && j<H && i>=0 && i<W)
+								((short int *)ColBuff+jj*InFeat*Fx+ii*InFeat + F)[Iter/2-1] = ((short int *)In+j*W*InFeat + i*InFeat + F)[Iter/2-1];
+				if (Iter&0x1)
+					for (int j=Tb, jj=0; j<Db; j+=Dy, jj++)
+						for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
+							if (j>=0 && j<H && i>=0 && i<W)
+								((signed char *)ColBuff+jj*InFeat*Fx+ii*InFeat + F)[Iter-1] = ((signed char *)In+j*W*InFeat + i*InFeat + F)[Iter-1];				
+			} else if (Iter>0)
+				for (int j=Tb, jj=0; j<Db; j+=Dy, jj++)
+				       	for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
+				       		if (j>=0 && j<H && i>=0 && i<W)
+							ColBuff[jj*InFeat*Fx+ii*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
+*/
+			for (int f=0; f<Iter; f++)
+				for (int j=Tb, jj=0; j<Db; j+=Dy, jj++) {
+				       	for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
+				       		if (j>=0 && j<H && i>=0 && i<W) 
+							ColBuff[jj*InFeat*Fx+ii*InFeat + F+f] = In[j*W*InFeat + i*InFeat + F+f];
+				       	for (int i=Lb1, ii=0; i<Rb1; i+=Dx, ii++)
+				       		if (j>=0 && j<H && i>=0 && i<W) 
+							ColBuff1[jj*InFeat*Fx+ii*InFeat + F+f] = In[j*W*InFeat + i*InFeat + F+f];
+				}
+			PosC += 2*Sx;
+			gap_waitbarrier(0);
+
+			int *pBias = Bias + First;
+			signed char *pC = Filter + W_In1*First;
+			signed char *pOut0 = Out+l*Wo*OutFeat + (2*c+0)*OutFeat+First;
+			signed char *pOut1 = Out+l*Wo*OutFeat + (2*c+1)*OutFeat+First;
+			unsigned char *pSc = Scale + First;
+			unsigned char *pScN = ScaleN + First;
+	                for (int Line=0; Line<IterOut/4; Line++) {
+				signed char *pC0 = pC, *pC1 = pC0+W_In1, *pC2 = pC1+W_In1, *pC3 = pC2+W_In1;
+				pC=pC3+W_In1;
+	                        int S0 = (pBias[4*Line  ])<<NormBias, S4=S0;
+	                        int S1 = (pBias[4*Line+1])<<NormBias, S5=S1;
+	                        int S2 = (pBias[4*Line+2])<<NormBias, S6=S2;
+	                        int S3 = (pBias[4*Line+3])<<NormBias, S7=S3;
+				signed char *pIn = ColBuff;
+				signed char *pIn1 = ColBuff1;
+	                        for (int i=0; i<(W_In1/4); i++) {
+					v4s V0 = *((v4s *)pIn), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
+	                                S0 = gap_sumdotp4(V0, C0, S0); S4 = gap_sumdotp4(V1, C0, S4); 
+	                                S1 = gap_sumdotp4(V0, C1, S1); S5 = gap_sumdotp4(V1, C1, S5); 
+	                                S2 = gap_sumdotp4(V0, C2, S2); S6 = gap_sumdotp4(V1, C2, S6); 
+	                                S3 = gap_sumdotp4(V0, C3, S3); S7 = gap_sumdotp4(V1, C3, S7); 
+					pIn+=4; pIn1+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
+	                        }
+				for (int f=4*(W_In1/4); f<W_In1; f++) {
+					int V0 = *pIn, V1 = *pIn1, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
+					S0 += V0*C0; S4 += V1*C0;
+					S1 += V0*C1; S5 += V1*C1;
+					S2 += V0*C2; S6 += V1*C2;
+					S3 += V0*C3; S7 += V1*C3;
+					pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++;
+				}
+				v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
+						   gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
+				v4s R2 = gap_pack4(gap_clip(AT_SCALE(S4, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]), 7),
+						   gap_clip(AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]), 7));
+				*((v4s *) (pOut0+4*Line)) = R1;
+				*((v4s *) (pOut1+4*Line)) = R2;
+	                }
+			for (int i=4*(IterOut/4); i<IterOut; i++) {
+				signed char *pIn = ColBuff;
+				signed char *pIn1 = ColBuff1;
+	                        int S0 = (pBias[i])<<NormBias, S4=S0;
+	                        for (int i=0; i<(W_In1/4); i++) {
+					v4s V0 = *((v4s *)pIn), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC);
+	                               	S0 = gap_sumdotp4(V0, C0, S0); S4 = gap_sumdotp4(V1, C0, S4);
+					pIn+=4; pIn1+=4; pC+=4;
+				}
+				for (int f=4*(W_In1/4); f<W_In1; f++) {
+					int V0 = *pIn, V1 = *pIn1, C0 = *pC;
+					S0 += V0*C0; S4 += V1*C0;
+					pIn++; pIn1++; pC++;
+				}
+				*(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7);
+				*(pOut1+i) = gap_clip(AT_SCALE(S4, pSc[i], pScN[i]), 7);
+			}
+			gap_waitbarrier(0);
+		}
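+		/* Epilogue: handle the last output column when Wo is odd */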
+		if (Wo&0x1) {
+			int c = Wo-1;
 			for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*FS))[i]=0;
 			if (Iter1&0x2) ((short int *)(ColBuff+F*FS))[Iter1/2-1]=0;
 			if (Iter1&0x1) ((signed char *)(ColBuff+F*FS))[Iter1-1]=0;
 			int Lb = PosC, Rb = PosC+DFx;
-			// This part is more efficient but NOT WORKING ???? TOCHECK
-			// if (Iter>=4) {
-			// 	for (int f=0; f<(Iter/4); f++)
-			// 		for (int j=Tb, jj=0; j<Db; j+=Dy, jj++)
-			// 			for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
-			// 				if (j>=0 && j<H && i>=0 && i<W)
-			// 					((int *)ColBuff+jj*InFeat*Fx+ii*InFeat + F)[f] = ((int *)In+j*W*InFeat + i*InFeat + F)[f];
-			// 	if (Iter&0x2)
-			// 		for (int j=Tb, jj=0; j<Db; j+=Dy, jj++)
-			// 			for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
-			// 				if (j>=0 && j<H && i>=0 && i<W)
-			// 					((short int *)ColBuff+jj*InFeat*Fx+ii*InFeat + F)[Iter/2-1] = ((short int *)In+j*W*InFeat + i*InFeat + F)[Iter/2-1];
-			// 	if (Iter&0x1)
-			// 		for (int j=Tb, jj=0; j<Db; j+=Dy, jj++)
-			// 			for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
-			// 				if (j>=0 && j<H && i>=0 && i<W)
-			// 					((signed char *)ColBuff+jj*InFeat*Fx+ii*InFeat + F)[Iter-1] = ((signed char *)In+j*W*InFeat + i*InFeat + F)[Iter-1];
-			// } else if (Iter>=2) {
-			// 	if (Iter&0x2)
-			// 		for (int j=Tb, jj=0; j<Db; j+=Dy, jj++)
-			// 			for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
-			// 				if (j>=0 && j<H && i>=0 && i<W)
-			// 					((short int *)ColBuff+jj*InFeat*Fx+ii*InFeat + F)[Iter/2-1] = ((short int *)In+j*W*InFeat + i*InFeat + F)[Iter/2-1];
-			// 	if (Iter&0x1)
-			// 		for (int j=Tb, jj=0; j<Db; j+=Dy, jj++)
-			// 			for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
-			// 				if (j>=0 && j<H && i>=0 && i<W)
-			// 					((signed char *)ColBuff+jj*InFeat*Fx+ii*InFeat + F)[Iter-1] = ((signed char *)In+j*W*InFeat + i*InFeat + F)[Iter-1];				
-			// } else if (Iter>0)
-			// 	for (int j=Tb, jj=0; j<Db; j+=Dy, jj++)
-			// 	       	for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
-			// 	       		if (j>=0 && j<H && i>=0 && i<W)
-			// 				ColBuff[jj*InFeat*Fx+ii*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
+
 			for (int f=0; f<Iter; f++)
 				for (int j=Tb, jj=0; j<Db; j+=Dy, jj++)
 				       	for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
@@ -2709,7 +3246,7 @@ void KerPar_MM_Conv2D_DxDy_HWC_SQ8(
 							ColBuff[jj*InFeat*Fx+ii*InFeat + F+f] = In[j*W*InFeat + i*InFeat + F+f];
 			PosC += Sx;
 			gap_waitbarrier(0);
-			// for (int Line=First; Line<Last; Line++) {
+
 			int *pBias = Bias + First;
 			signed char *pC = Filter + W_In1*First;
 			signed char *pOut0 = Out+l*Wo*OutFeat + (c+0)*OutFeat+First;
@@ -2851,6 +3388,7 @@ void KerPar_MM_Conv2D_DxDy_ReLU_SQ8(
 	gap_waitbarrier(0);
 }
 
+#if 0
 void KerPar_MM_ConvDW1D_HWC_SQ8(
 	Ker_MM_Conv_SQ8_T *Arg
 	)
@@ -2998,4 +3536,5 @@ void KerPar_MM_ConvDW2D_HWC_SQ8(
 	}
 	gap_waitbarrier(0);
 }
+#endif
 #pragma GCC diagnostic pop
diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_BasicKernels_fp16.h b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_BasicKernels_fp16.h
index e6189463f..bca36c985 100644
--- a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_BasicKernels_fp16.h
+++ b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_BasicKernels_fp16.h
@@ -583,13 +583,13 @@ extern void KerParMatMulLeakyreluSxSy_fp16(KerMatMul_fp16_T *Arg);
 extern void KerParMatMulSmallFeat_fp16(KerMatMul_fp16_T *Arg);
 extern void KerParMatMulSmallFeatReLU_fp16(KerMatMul_fp16_T *Arg);
 extern void KerParMatMulSmallFeatReLUN_fp16(KerMatMul_fp16_T *Arg);
-extern void KerParMatMulSwishSmallFeat_fp16(KerMatMul_fp16_T *Arg);
-extern void KerParMatMulHSwishSmallFeat_fp16(KerMatMul_fp16_T *Arg);
+extern void KerParMatMulSmallFeatSwish_fp16(KerMatMul_fp16_T *Arg);
+extern void KerParMatMulSmallFeatHSwish_fp16(KerMatMul_fp16_T *Arg);
 extern void KerParMatMulSigmoidSmallFeat_fp16(KerMatMul_fp16_T *Arg);
-extern void KerParMatMulHSigmoidSmallFeat_fp16(KerMatMul_fp16_T *Arg);
-extern void KerParMatMulTanhSmallFeat_fp16(KerMatMul_fp16_T *Arg);
-extern void KerParMatMulHTanhSmallFeat_fp16(KerMatMul_fp16_T *Arg);
-extern void KerParMatMulLeakyreluSmallFeat_fp16(KerMatMul_fp16_T *Arg);
+extern void KerParMatMulSmallFeatHSigmoid_fp16(KerMatMul_fp16_T *Arg);
+extern void KerParMatMulSmallFeatTanh_fp16(KerMatMul_fp16_T *Arg);
+extern void KerParMatMulSmallFeatHTanh_fp16(KerMatMul_fp16_T *Arg);
+extern void KerParMatMulSmallFeatLeakyrelu_fp16(KerMatMul_fp16_T *Arg);
 
 /******************************************************************************************************************************/
 /******************* SOFT MAX *************************************************************************************************/
diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatAlgebra_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatAlgebra_fp16.c
index fa2c466d1..ece4f10f6 100644
--- a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatAlgebra_fp16.c
+++ b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatAlgebra_fp16.c
@@ -6799,7 +6799,7 @@ void KerParMatMulSmallFeatReLUN_fp16(KerMatMul_fp16_T *Arg)
 	gap_waitbarrier(0);
 }
 
-void KerParMatMulSwishSmallFeat_fp16(KerMatMul_fp16_T *Arg)
+void KerParMatMulSmallFeatSwish_fp16(KerMatMul_fp16_T *Arg)
 
 {
 	F16 * __restrict__ In1 = Arg->In1;
@@ -6833,7 +6833,7 @@ void KerParMatMulSwishSmallFeat_fp16(KerMatMul_fp16_T *Arg)
 	gap_waitbarrier(0);
 }
 
-void KerParMatMulHSwishSmallFeat_fp16(KerMatMul_fp16_T *Arg)
+void KerParMatMulSmallFeatHSwish_fp16(KerMatMul_fp16_T *Arg)
 
 {
 	F16 * __restrict__ In1 = Arg->In1;
@@ -6867,7 +6867,7 @@ void KerParMatMulHSwishSmallFeat_fp16(KerMatMul_fp16_T *Arg)
 	gap_waitbarrier(0);
 }
 
-void KerParMatMulSigmoidSmallFeat_fp16(KerMatMul_fp16_T *Arg)
+void KerParMatMulSmallFeatSigmoid_fp16(KerMatMul_fp16_T *Arg)
 
 {
 	F16 * __restrict__ In1 = Arg->In1;
@@ -6901,7 +6901,7 @@ void KerParMatMulSigmoidSmallFeat_fp16(KerMatMul_fp16_T *Arg)
 	gap_waitbarrier(0);
 }
 
-void KerParMatMulHSigmoidSmallFeat_fp16(KerMatMul_fp16_T *Arg)
+void KerParMatMulSmallFeatHSigmoid_fp16(KerMatMul_fp16_T *Arg)
 
 {
 	F16 * __restrict__ In1 = Arg->In1;
@@ -6936,7 +6936,7 @@ void KerParMatMulHSigmoidSmallFeat_fp16(KerMatMul_fp16_T *Arg)
 	gap_waitbarrier(0);
 }
 
-void KerParMatMulTanhSmallFeat_fp16(KerMatMul_fp16_T *Arg)
+void KerParMatMulSmallFeatTanh_fp16(KerMatMul_fp16_T *Arg)
 
 {
 	F16 * __restrict__ In1 = Arg->In1;
@@ -6970,7 +6970,7 @@ void KerParMatMulTanhSmallFeat_fp16(KerMatMul_fp16_T *Arg)
 	gap_waitbarrier(0);
 }
 
-void KerParMatMulHTanhSmallFeat_fp16(KerMatMul_fp16_T *Arg)
+void KerParMatMulSmallFeatHTanh_fp16(KerMatMul_fp16_T *Arg)
 
 {
 	F16 * __restrict__ In1 = Arg->In1;
@@ -7004,7 +7004,7 @@ void KerParMatMulHTanhSmallFeat_fp16(KerMatMul_fp16_T *Arg)
 	gap_waitbarrier(0);
 }
 
-void KerParMatMulLeakyreluSmallFeat_fp16(KerMatMul_fp16_T *Arg)
+void KerParMatMulSmallFeatLeakyrelu_fp16(KerMatMul_fp16_T *Arg)
 
 {
 	F16 * __restrict__ In1 = Arg->In1;
diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatMul_Conv_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatMul_Conv_fp16.c
index f5137a37f..5af1deb3a 100644
--- a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatMul_Conv_fp16.c
+++ b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatMul_Conv_fp16.c
@@ -17,7 +17,7 @@ static inline unsigned int __attribute__((always_inline)) ChunkSize(unsigned int
         return Chunk;
 }
 
-static int FirstDefinedOutput(unsigned int F, unsigned int Pad, unsigned int Stride)
+static int FirstDefinedOutput(int F, int Pad, int Stride)
 
 {
         // k*S - (F-1)/2 >=0 => k >= (((F-1)/2) + S-1)/S
@@ -25,12 +25,12 @@ static int FirstDefinedOutput(unsigned int F, unsigned int Pad, unsigned int Str
         return ((Pad+Stride-1)/Stride);
 }
 
-static int LastDefinedOutput(unsigned int DimIn, unsigned int F, unsigned int PadL, unsigned int Stride)
+static int LastDefinedOutput(int DimIn, int F, int PadL, int Stride, int D)
 
 {
         // k*S + ((F-1)/2 - PadL + F/2) < Dim  => k < (Dim-((F-1)/2 - PadL + (F/2)) + S-1)/S
 
-        return ((DimIn - ((F-1)/2 - PadL + (F/2)) + Stride-1)/Stride);
+        return ((DimIn - ((F-1)/2*D - PadL + (F/2)*D) + Stride-1)/Stride);
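+        // With dilation D the filter spans D*(F-1)+1 inputs, so both half-extents
+        // (F-1)/2 and F/2 scale by D in the bound above.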
 }
 
 void KerPar_MM_Conv1D_fp16(
@@ -266,6 +266,7 @@ void KerPar_MM_Conv1D_HWC_fp16(
 	int PosL = 0;
 	int Iter = L-F;
 	int Iter1 = Iter*Fx;
+	int IterOut = Last - First;
 	for (int l=0; l<Ho; l++) {
 		int PosC = -PadL;
 		for (int c=0; c<Wo; c++) {
@@ -280,17 +281,53 @@ void KerPar_MM_Conv1D_HWC_fp16(
 			} else for (int i=Lb; i<Rb; i++) ColBuff[(i+Off)*InFeat + F] = In[PosL*W*InFeat + i*InFeat + F];
 			PosC += Sx;
 			gap_waitbarrier(0);
-	                for (int Line=First; Line<Last; Line++) {
-	                        F16V *VIn1 = (F16V *) (&Filter[Line*W_In1 + 0]);
-	                        F16V S0 = (F16V){Bias[Line],0.0}, S1 = (F16V) 0;
-	                        for (int i=0; i<((W_In1+3)/4); i++) {
-	                                F16V V0 = VIn1[2*i], V1 = VIn1[2*i+1];
-					F16V C0 = VBuff[2*i], C1 = VBuff[2*i+1];
-	                                S0 += V0 * C0;
-	                                S1 += V1 * C1;
+
+			F16 *pBias = Bias + First;
+			F16 *pC = Filter + W_In1*First;
+			F16 *pOut0 = Out+l*Wo*OutFeat + (c+0)*OutFeat+First;
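+			/* 4 output channels per pass; inputs are consumed two F16 at a time
+			   through F16V, with a scalar fix-up when W_In1 is odd */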
+	                for (int Line=0; Line<IterOut/4; Line++) {
+				F16 *pC0 = pC, *pC1 = pC0+W_In1, *pC2 = pC1+W_In1, *pC3 = pC2+W_In1;
+				pC=pC3+W_In1;
+	                        F16V S0 = (F16V){*pBias,0.0}; pBias++;
+	                        F16V S1 = (F16V){*pBias,0.0}; pBias++;
+	                        F16V S2 = (F16V){*pBias,0.0}; pBias++;
+	                        F16V S3 = (F16V){*pBias,0.0}; pBias++;
+				F16 *pIn = ColBuff;
+	                        for (int i=0; i<(W_In1/2); i++) {
+					F16V V0 = *((F16V *)pIn), C0 = *((F16V *)pC0), C1 = *((F16V *)pC1), C2 = *((F16V *)pC2), C3 = *((F16V *)pC3);
+					S0 += V0*C0;
+					S1 += V0*C1;
+					S2 += V0*C2;
+					S3 += V0*C3;
+					pIn+=2; pC0+=2; pC1+=2; pC2+=2; pC3+=2;
 	                        }
-	                        Out[l*Wo*OutFeat + c*OutFeat + Line] = S0[0]+S0[1]+S1[0]+S1[1];
+				if (W_In1&0x1) {
+					F16V V0 = (F16V){*pIn,0.0}, C0 = *((F16V *)pC0), C1 = *((F16V *)pC1), C2 = *((F16V *)pC2), C3 = *((F16V *)pC3);
+					S0 += V0*C0;
+					S1 += V0*C1;
+					S2 += V0*C2;
+					S3 += V0*C3;
+				}
+				*pOut0 = S0[0]+S0[1]; pOut0++;
+				*pOut0 = S1[0]+S1[1]; pOut0++;
+				*pOut0 = S2[0]+S2[1]; pOut0++;
+				*pOut0 = S3[0]+S3[1]; pOut0++;
 	                }
+			for (int i=4*(IterOut/4); i<IterOut; i++) {
+				F16 *pIn = ColBuff;
+	                        F16V S0 = (F16V){*pBias,0.0}; pBias++;
+	                        for (int i=0; i<(W_In1/2); i++) {
+					F16V V0 = *((F16V *)pIn), C0 = *((F16V *)pC);
+					S0 += V0*C0;
+					pIn+=2; pC+=2;
+				}
+				if (W_In1&0x1) {
+					F16V V0 = (F16V){*pIn,0.0}, C0 = *((F16V *)pC);
+					S0 += V0*C0;
+				}
+				*pOut0 = S0[0]+S0[1]; pOut0++;
+			}
 			gap_waitbarrier(0);
 		}
 		PosL += Sy;
@@ -395,44 +432,80 @@ void KerPar_MM_Conv1D_DxDy_HWC_fp16(
 	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
 	
 	int DFx = Dx*(Fx-1)+1;
-	int Prec=10;
-	int InvDx = ((1<<Prec)+Dx-1)/Dx;
 
 	int PosL = 0;
 	int Iter = L-F;
 	int Iter1 = Iter*Fx;
+	int IterOut = Last - First;
 	for (int l=0; l<Ho; l++) {
 		int PosC = -PadL;
 		for (int c=0; c<Wo; c++) {
 			for (int i=0; i<(Iter1/2); i++) ((int *)(ColBuff+F*Fx))[i]=0;
 			if (Iter1&0x1) ((short int *)(ColBuff+F*Fx))[Iter1-1]=0;
-			int Lb = Max(PosC, 0), Rb = Min(PosC+DFx, W);
-			int OffBuffX = Max(0, gap_mulsN(-PosC+Dx-1, InvDx, Prec));
-			int OffInX = OffBuffX?(Dx*OffBuffX+PosC):0;
-			int IterX = gap_mulsN(Rb-Lb-1, InvDx, Prec) + 1;
+			int Lb = PosC, Rb = PosC+DFx;
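+			/* Dilated taps are gathered with a per-tap bounds check, replacing
+			   the fixed-point offset precomputation removed above */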
 			if (Iter>=2) {
 				for (int f=0; f<(Iter/2); f++)
-					for (int i=0; i<IterX; i++)
-						((int *)(ColBuff+(i+OffBuffX)*InFeat+F))[f] = ((int *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[f];
+				       	for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
+				       		if (i>=0 && i<W) 
+							((int *) (ColBuff + ii*InFeat + F))[f] = ((int *)(In+i*InFeat + F))[f];
 				if (Iter&0x1)
-					for (int i=0; i<IterX; i++)
-						((short int *)(ColBuff+(i+OffBuffX)*InFeat+F))[Iter-1] = ((short int *)(In+PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat+F))[Iter-1];
-			} else
-				for (int i=0; i<IterX; i++)
-					ColBuff[(i+OffBuffX)*InFeat + F] = In[PosL*W*InFeat + (i*Dx+OffInX+Lb)*InFeat + F];
+				       	for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
+				       		if (i>=0 && i<W) 
+							((short int *) (ColBuff + ii*InFeat + F))[Iter-1] = ((short int *)(In+i*InFeat + F))[Iter-1];
+			} else {
+			       	for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
+			       		if (i>=0 && i<W) 
+						ColBuff[ii*InFeat + F] = In[i*InFeat + F];
+			}
 			PosC += Sx;
 			gap_waitbarrier(0);
-	                for (int Line=First; Line<Last; Line++) {
-	                        F16V *VIn1 = (F16V *) (&Filter[Line*W_In1 + 0]);
-	                        F16V S0 = (F16V){Bias[Line],0.0}, S1 = (F16V) 0;
-	                        for (int i=0; i<((W_In1+3)/4); i++) {
-	                                F16V V0 = VIn1[2*i], V1 = VIn1[2*i+1];
-					F16V C0 = VBuff[2*i], C1 = VBuff[2*i+1];
-	                                S0 += V0 * C0;
-	                                S1 += V1 * C1;
+
+			F16 *pBias = Bias + First;
+			F16 *pC = Filter + W_In1*First;
+			F16 *pOut0 = Out+l*Wo*OutFeat + (c+0)*OutFeat+First;
+	                for (int Line=0; Line<IterOut/4; Line++) {
+				F16 *pC0 = pC, *pC1 = pC0+W_In1, *pC2 = pC1+W_In1, *pC3 = pC2+W_In1;
+				pC=pC3+W_In1;
+	                        F16V S0 = (F16V){*pBias,0.0}; pBias++;
+	                        F16V S1 = (F16V){*pBias,0.0}; pBias++;
+	                        F16V S2 = (F16V){*pBias,0.0}; pBias++;
+	                        F16V S3 = (F16V){*pBias,0.0}; pBias++;
+				F16 *pIn = ColBuff;
+	                        for (int i=0; i<(W_In1/2); i++) {
+					F16V V0 = *((F16V *)pIn), C0 = *((F16V *)pC0), C1 = *((F16V *)pC1), C2 = *((F16V *)pC2), C3 = *((F16V *)pC3);
+					S0 += V0*C0;
+					S1 += V0*C1;
+					S2 += V0*C2;
+					S3 += V0*C3;
+					pIn+=2; pC0+=2; pC1+=2; pC2+=2; pC3+=2;
 	                        }
-	                        Out[l*Wo*OutFeat + c*OutFeat + Line] = S0[0]+S0[1]+S1[0]+S1[1];
+				if (W_In1&0x1) {
+					F16V V0 = (F16V){*pIn,0.0}, C0 = *((F16V *)pC0), C1 = *((F16V *)pC1), C2 = *((F16V *)pC2), C3 = *((F16V *)pC3);
+					S0 += V0*C0;
+					S1 += V0*C1;
+					S2 += V0*C2;
+					S3 += V0*C3;
+				}
+				*pOut0 = S0[0]+S0[1]; pOut0++;
+				*pOut0 = S1[0]+S1[1]; pOut0++;
+				*pOut0 = S2[0]+S2[1]; pOut0++;
+				*pOut0 = S3[0]+S3[1]; pOut0++;
 	                }
+			for (int i=4*(IterOut/4); i<IterOut; i++) {
+				F16 *pIn = ColBuff;
+	                        F16V S0 = (F16V){*pBias,0.0}; pBias++;
+	                        for (int i=0; i<(W_In1/2); i++) {
+					F16V V0 = *((F16V *)pIn), C0 = *((F16V *)pC);
+					S0 += V0*C0;
+					pIn+=2; pC+=2;
+				}
+				if (W_In1&0x1) {
+					F16V V0 = (F16V){*pIn,0.0}, C0 = *((F16V *)pC);
+					S0 += V0*C0;
+				}
+				*pOut0 = S0[0]+S0[1]; pOut0++;
+			}
 			gap_waitbarrier(0);
 		}
 		PosL += Sy;
@@ -1156,55 +1229,86 @@ void KerPar_MM_Conv2D_DxDy_HWC_fp16(
 	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
 	int PosL = Arg->FirstTile?(-PadT):0;
 	int DFx = Dx*(Fx-1)+1, DFy =  Dy*(Fy-1)+1;
-	int Prec=10;
-	int InvDx = ((1<<Prec)+Dx-1)/Dx;
-	int InvDy = ((1<<Prec)+Dy-1)/Dy;
 	int Iter = L-F;
 	int Iter1 = Iter*FS;
+	int IterOut = Last - First;
 
 	for (int l=0; l<Ho; l++) {
 		int PosC = -PadL;
-		int Tb = Max(PosL, 0), Db = Min(PosL+DFy, H);
-		int OffLBuffY = Max(0, gap_mulsN(-PosL+Dy-1, InvDy, Prec));
-		int OffLInY = OffLBuffY?(Dy*OffLBuffY+PosL):0;
+		int Tb = PosL, Db = PosL+DFy;
 		for (int c=0; c<Wo; c++) {
 			for (int i=0; i<(Iter1/2); i++) ((int *)(ColBuff+F*FS))[i]=0;
 			if (Iter1&0x1) ((short int *)(ColBuff+F*FS))[Iter1-1]=0;
-			int Lb = Max(PosC, 0), Rb = Min(PosC+DFx, W);
-			int OffCBuffX = (Lb==0)?Max(0, gap_mulsN(-PosC+Dx-1, InvDx, Prec)):0;
-			int OffCInX = OffCBuffX?(Dx*OffCBuffX+PosC):0;
-			int IterY = gap_mulsN(Db-Tb-1, InvDy, Prec) + 1;
-			int IterX = gap_mulsN(Rb-Lb-1, InvDx, Prec) + 1;
-                        if (Iter>=2) {
-                                for (int f=0; f<(Iter/2); f++)
-					for (int j=Tb; j<Db; j++)
-                                        	for (int i=Lb; i<Rb; i++)
-							((int *)(ColBuff+(j+OffLBuffY)*InFeat*Fx+(i+OffCBuffX)*InFeat+F))[f] =
-								((int *)(In+(Tb+j*Dy+OffLInY)*W*InFeat + (Lb+i*Dx+OffCInX)*InFeat+F))[f];
-                                if (Iter&0x1)
-					for (int j=Tb; j<Db; j++)
-						for (int i=Lb; i<Rb; i++)
-							((short int *)(ColBuff+(j+OffLBuffY)*InFeat*Fx+(i+OffCBuffX)*InFeat+F))[Iter-1] =
-								((short int *)(In+(Tb+j*Dy+OffLInY)*W*InFeat + (Lb+i*Dx+OffCInX)*InFeat+F))[Iter-1];
-                        } else
-				for (int j=Tb; j<Db; j++) 
-					for (int i=Lb; i<Rb; i++)
-						ColBuff[(j+OffLBuffY)*InFeat*Fx+(i+OffCBuffX)*InFeat + F] =
-							In[(Tb+j*Dy+OffLInY)*W*InFeat + (Lb+i*Dx+OffCInX)*InFeat + F];
-
+			int Lb = PosC, Rb = PosC+DFx;
+			if (Iter>=2) {
+				for (int f=0; f<(Iter/2); f++)
+					for (int j=Tb, jj=0; j<Db; j+=Dy, jj++)
+					       	for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
+					       		if (j>=0 && j<H && i>=0 && i<W) 
+								((int *) (ColBuff + jj*InFeat*Fx+ii*InFeat + F))[f] = ((int *)(In+j*W*InFeat + i*InFeat + F))[f];
+				if (Iter&0x1)
+					for (int j=Tb, jj=0; j<Db; j+=Dy, jj++)
+					       	for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
+					       		if (j>=0 && j<H && i>=0 && i<W) 
+								((short int *) (ColBuff + jj*InFeat*Fx+ii*InFeat + F))[Iter-1] = ((short int *)(In+j*W*InFeat + i*InFeat + F))[Iter-1];
+			} else {
+				for (int j=Tb, jj=0; j<Db; j+=Dy, jj++)
+				       	for (int i=Lb, ii=0; i<Rb; i+=Dx, ii++)
+				       		if (j>=0 && j<H && i>=0 && i<W) 
+							ColBuff[jj*InFeat*Fx+ii*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
+			}
 			PosC += Sx;
 			gap_waitbarrier(0);
-	                for (int Line=First; Line<Last; Line++) {
-	                        F16V *VIn1 = (F16V *) (&Filter[Line*W_In1 + 0]);
-	                        F16V S0 = (F16V){Bias[Line],0.0};
-	                        for (int i=0; i<((W_In1+3)/4); i++) {
-	                                F16V V0 = VIn1[2*i], V1 = VIn1[2*i+1];
-					F16V C0 = VBuff[2*i], C1 = VBuff[2*i+1];
-	                                S0 += V0 * C0;
-	                                S0 += V1 * C1;
+
+			F16 *pBias = Bias + First;
+			F16 *pC = Filter + W_In1*First;
+			F16 *pOut0 = Out+l*Wo*OutFeat + (c+0)*OutFeat+First;
+	                for (int Line=0; Line<IterOut/4; Line++) {
+				F16 *pC0 = pC, *pC1 = pC0+W_In1, *pC2 = pC1+W_In1, *pC3 = pC2+W_In1;
+				pC=pC3+W_In1;
+	                        F16V S0 = (F16V){*pBias,0.0}; pBias++;
+	                        F16V S1 = (F16V){*pBias,0.0}; pBias++;
+	                        F16V S2 = (F16V){*pBias,0.0}; pBias++;
+	                        F16V S3 = (F16V){*pBias,0.0}; pBias++;
+				F16 *pIn = ColBuff;
+	                        for (int i=0; i<(W_In1/2); i++) {
+					F16V V0 = *((F16V *)pIn), C0 = *((F16V *)pC0), C1 = *((F16V *)pC1), C2 = *((F16V *)pC2), C3 = *((F16V *)pC3);
+					S0 += V0*C0;
+					S1 += V0*C1;
+					S2 += V0*C2;
+					S3 += V0*C3;
+					pIn+=2; pC0+=2; pC1+=2; pC2+=2; pC3+=2;
 	                        }
-	                        Out[l*Wo*OutFeat + c*OutFeat + Line] = S0[0]+S0[1];
+				if (W_In1&0x1) {
+					F16V V0 = (F16V){*pIn,0.0}, C0 = *((F16V *)pC0), C1 = *((F16V *)pC1), C2 = *((F16V *)pC2), C3 = *((F16V *)pC3);
+					S0 += V0*C0;
+					S1 += V0*C1;
+					S2 += V0*C2;
+					S3 += V0*C3;
+				}
+				*pOut0 = S0[0]+S0[1]; pOut0++;
+				*pOut0 = S1[0]+S1[1]; pOut0++;
+				*pOut0 = S2[0]+S2[1]; pOut0++;
+				*pOut0 = S3[0]+S3[1]; pOut0++;
 	                }
+			for (int i=4*(IterOut/4); i<IterOut; i++) {
+				F16 *pIn = ColBuff;
+	                        F16V S0 = (F16V){*pBias,0.0}; pBias++;
+	                        for (int i=0; i<(W_In1/2); i++) {
+					F16V V0 = *((F16V *)pIn), C0 = *((F16V *)pC);
+					S0 += V0*C0;
+					pIn+=2; pC+=2;
+				}
+				if (W_In1&0x1) {
+					F16V V0 = (F16V){*pIn,0.0}, C0 = *((F16V *)pC);
+					S0 += V0*C0;
+				}
+				*pOut0 = S0[0]+S0[1]; pOut0++;
+			}
 			gap_waitbarrier(0);
 		}
 		PosL += Sy;
diff --git a/tools/autotiler_v3/DSP_Libraries/FloatDefines.h b/tools/autotiler_v3/DSP_Libraries/FloatDefines.h
index 46b19828f..27317e9ad 100644
--- a/tools/autotiler_v3/DSP_Libraries/FloatDefines.h
+++ b/tools/autotiler_v3/DSP_Libraries/FloatDefines.h
@@ -110,5 +110,6 @@
 	#define Cvt_v2h_v2ah(a) (v2ah) (a)
 	#define Cvt_v2ah_v2h(a) (v2h) (a)
 #endif //__gap9__
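+/* Clamp a into [lower, upper]; note that the upper bound precedes the lower one */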
+#define Clipf32(a, upper, lower) ((float) Maxf32(Minf32((a), (upper)), (lower)))
 
 #endif
diff --git a/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/GenMFCCLUT.py b/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/GenMFCCLUT.py
index 6581ab8ef..947e2d625 100644
--- a/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/GenMFCCLUT.py
+++ b/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/GenMFCCLUT.py
@@ -31,7 +31,7 @@ def create_parser():
 	parser.add_argument('--name_suffix', default="", type=str)
 	parser.add_argument('--frame_size', required="--params_json" not in sys.argv, type=int,
 						help="size in number of samples of one frame")
-	parser.add_argument('--frame_step', required="--params_json" not in sys.argv, type=int,
+	parser.add_argument('--frame_step', type=int, default=0,
 						help="step in number of samples between two consecutive frames")
 	parser.add_argument('--win_func', default="hanning", type=str,
 						help="numpy window function (e.g. hanning)")
diff --git a/tools/autotiler_v3/Emulation/GapBuiltins.h b/tools/autotiler_v3/Emulation/GapBuiltins.h
index 0df6a71cf..03f22c167 100644
--- a/tools/autotiler_v3/Emulation/GapBuiltins.h
+++ b/tools/autotiler_v3/Emulation/GapBuiltins.h
@@ -230,6 +230,16 @@ static inline unsigned int ExtInsMaskSafe(unsigned int Size, unsigned int Offset
 #define gap_norm(x, scale)			__builtin_pulp_addN((x), 0, (scale))
 #define gap_norm_reg(x, scale)			__builtin_pulp_addN_r((x), 0, (scale))
 
+/* Floating point */
+#define gap_f32min(a,b)				__builtin_pulp_f32min((a), (b))
+#define gap_f32max(a,b)				__builtin_pulp_f32max((a), (b))
+#define gap_f32sqrt(a)				__builtin_pulp_f32sqrt((a))
+#define gap_f32abs(a)				__builtin_pulp_f32abs((a))
+#define gap_f32rmm(a)				__builtin_pulp_rintsf2((a))
+#define gap_f32rdn(a)				__builtin_pulp_rdownsf2((a))
+#define gap_f32rup(a)				__builtin_pulp_rupsf2((a))
+
+
 #else
 /* Emulation */
 
@@ -452,6 +462,17 @@ static int _VitT0_Flag, _VitT1_Flag;
 #define gap_norm(x, scale)			((int)(x)>>(scale))
 #define gap_norm_reg(x, scale)			((int)(x)>>(scale))
 
+/* Floating point */
+#include <math.h>
+#define gap_f32min(a,b)		fminf((a), (b))
+#define gap_f32max(a,b)		fmaxf((a), (b))
+#define gap_f32sqrt(a)		sqrtf((a))
+#define gap_f32abs(a)		fabsf((a))
+#define gap_f32rmm(a)		(((a)>=0)?((int)((a)+0.5f)):(-((int)(-(a)+0.5f))))
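+/* Note: this emulation rounds ties away from zero, which may differ from the
+   hardware rounding behaviour for exact .5 inputs (assumption) */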
+#define gap_f32rdn(a)		floorf((a))
+#define gap_f32rup(a)		ceilf((a))
+
+
 #endif
 
 #define FIX2FP(Val, Precision)		((float) (Val) / (float) (1<<(Precision)))
diff --git a/tools/autotiler_v3/Generators/BilinearResizes/ResizeBasicKernels.c b/tools/autotiler_v3/Generators/BilinearResizes/ResizeBasicKernels.c
index ccbbbd07f..a78820daa 100644
--- a/tools/autotiler_v3/Generators/BilinearResizes/ResizeBasicKernels.c
+++ b/tools/autotiler_v3/Generators/BilinearResizes/ResizeBasicKernels.c
@@ -281,3 +281,88 @@ void KerResizeNearestNeighborSigned_Q16(KerResizeSigned16_ArgT *Arg)
         }
         gap_waitbarrier(0);
 }
+
+#ifdef __gap9__
+void KerResizeBilinear_fp16(KerResize_fp16_ArgT *Arg)
+
+{
+        F16 * __restrict__ In       = Arg->In;
+        unsigned int Win            = Arg->Win;
+        unsigned int Hin            = Arg->Hin;
+        F16 * __restrict__ Out      = Arg->Out;
+        unsigned int Wout           = Arg->Wout;
+        unsigned int Hout           = Arg->Hout;
+        unsigned int HTileOut       = Arg->HTileOut;
+        unsigned int FirstLineIndex = Arg->FirstLineIndex;
+
+        unsigned int CoreId = gap_coreid();
+        unsigned int ChunkCell = ChunkSize(Wout);
+        unsigned int First = CoreId*ChunkCell, Last  = Min(Wout, First+ChunkCell);
+
+        F16 WStep = ((F16) (Win-1))/Wout;
+        F16 HStep = ((F16) (Hin-1))/Hout;
+
+        unsigned int x, y;
+        F16 hCoeff = ((F16) HStep)*FirstLineIndex;
+        int BaseY = (int) hCoeff;
+        for (y = 0 ; y < HTileOut ; y++) {
+                int offsetY = ((int) hCoeff) - BaseY;
+                F16 hc2 = hCoeff - (int) hCoeff;      /* interpolation weights are the */
+                F16 hc1 = 1.0 - hc2;                  /* fractional part of the coordinate */
+                F16 wCoeff = First*WStep;
+
+                for (x = First ; x < Last ; x++) {
+                        int offsetX = (int) wCoeff;
+                        F16 wc2 = wCoeff - offsetX;
+                        F16 wc1 = 1.0 - wc2;
+                        F16 P1 = In[offsetY*Win       + offsetX    ];
+                        F16 P2 = In[(offsetY + 1)*Win + offsetX    ];
+                        F16 P3 = In[offsetY*Win       + offsetX + 1];
+                        F16 P4 = In[(offsetY + 1)*Win + offsetX + 1];
+
+                        Out[y*Wout + x] = ((P1*hc1 + P2*hc2)*wc1 + (P3*hc1 + P4*hc2)*wc2);
+                        wCoeff += WStep;
+                }
+                hCoeff += HStep;
+        }
+        gap_waitbarrier(0);
+}
+
+void KerResizeNearestNeighbor_fp16(KerResize_fp16_ArgT *Arg)
+
+{
+        F16 * __restrict__ In       = Arg->In;
+        unsigned int Win            = Arg->Win;
+        unsigned int Hin            = Arg->Hin;
+        F16 * __restrict__ Out      = Arg->Out;
+        unsigned int Wout           = Arg->Wout;
+        unsigned int Hout           = Arg->Hout;
+        unsigned int HTileOut       = Arg->HTileOut;
+        unsigned int FirstLineIndex = Arg->FirstLineIndex;
+
+        unsigned int CoreId = gap_coreid();
+        unsigned int ChunkCell = ChunkSize(Wout);
+        unsigned int First = CoreId*ChunkCell, Last  = Min(Wout, First+ChunkCell);
+
+        F16 WStep = ((F16) (Win-1))/(Wout-1);
+        F16 HStep = ((F16) (Hin-1))/(Hout-1);
+
+        unsigned int x, y;
+        F16 hCoeff = HStep*FirstLineIndex;
+        int BaseY = (int) hCoeff;
+        for (y = 0 ; y < HTileOut ; y++) {
+                int h_rounded = ((int) (hCoeff + 0.5)) - BaseY;
+                F16 wCoeff = First*WStep;
+                for (x = First ; x < Last ; x++) {
+                        int w_rounded = (int) (wCoeff + 0.5);
+
+                        Out[y*Wout + x] = In[h_rounded*Win + w_rounded];
+                        wCoeff += WStep;
+                }
+                hCoeff += HStep;
+        }
+        gap_waitbarrier(0);
+}
+#endif
diff --git a/tools/autotiler_v3/Generators/BilinearResizes/ResizeBasicKernels.h b/tools/autotiler_v3/Generators/BilinearResizes/ResizeBasicKernels.h
index fd3216f29..6d1406d4f 100644
--- a/tools/autotiler_v3/Generators/BilinearResizes/ResizeBasicKernels.h
+++ b/tools/autotiler_v3/Generators/BilinearResizes/ResizeBasicKernels.h
@@ -12,6 +12,11 @@
 
 #include "Gap.h"
 
+#ifdef __gap9__
+#include "CNN_Defines_fp16.h"
+#include "CNN_FloatType.h"
+#endif
+
 #ifndef Max
 	#define Max(a, b)               (((a)>(b))?(a):(b))
 #endif
@@ -74,6 +79,22 @@ typedef struct {
 	unsigned int FirstLineIndex;
 } KerResizeSigned16_ArgT;
 
+#ifdef __gap9__
+typedef struct {
+	F16 * __restrict__ In;
+	unsigned int Win;
+	unsigned int Hin;
+	F16 * __restrict__ Out;
+	unsigned int Wout;
+	unsigned int Hout;
+	unsigned int HTileOut;
+	unsigned int FirstLineIndex;
+} KerResize_fp16_ArgT;
+
+void KerResizeBilinear_fp16(KerResize_fp16_ArgT *Arg);
+void KerResizeNearestNeighbor_fp16(KerResize_fp16_ArgT *Arg);
+#endif
+
 void KerResizeBilinear(KerResizeBilinear_ArgT *Arg);
 void KerResizeNearestNeighbor(KerResizeNearestNeighbor_ArgT *Arg);
 void KerResizeBilinearSigned(KerResizeBilinearSigned_ArgT *Arg);
diff --git a/tools/autotiler_v3/Generators/BilinearResizes/ResizeGenerator.c b/tools/autotiler_v3/Generators/BilinearResizes/ResizeGenerator.c
index f74baea96..5133a2148 100644
--- a/tools/autotiler_v3/Generators/BilinearResizes/ResizeGenerator.c
+++ b/tools/autotiler_v3/Generators/BilinearResizes/ResizeGenerator.c
@@ -92,6 +92,32 @@ void LoadResizeLibrary()
 		"KerResizeSigned16_ArgT",
 		NULL
 	);
+	LibKernel("KerResizeNearestNeighbor_fp16", CALL_PARALLEL,
+		CArgs(8,
+			TCArg("F16 * __restrict__", "In"),
+			TCArg("unsigned int", "Win"),
+			TCArg("unsigned int", "Hin"),
+			TCArg("F16 * __restrict__", "Out"),
+			TCArg("unsigned int", "Wout"),
+			TCArg("unsigned int", "Hout"),
+			TCArg("unsigned int", "HTileOut"),
+			TCArg("unsigned int", "FirstLineIndex")),
+		"KerResize_fp16_ArgT",
+		NULL
+	);
+	LibKernel("KerResizeBilinear_fp16", CALL_PARALLEL,
+		CArgs(8,
+			TCArg("F16 * __restrict__", "In"),
+			TCArg("unsigned int", "Win"),
+			TCArg("unsigned int", "Hin"),
+			TCArg("F16 * __restrict__", "Out"),
+			TCArg("unsigned int", "Wout"),
+			TCArg("unsigned int", "Hout"),
+			TCArg("unsigned int", "HTileOut"),
+			TCArg("unsigned int", "FirstLineIndex")),
+		"KerResize_fp16_ArgT",
+		NULL
+	);
 }
 
 int GenerateResizeMultiChannel(char *Name, unsigned int Win, unsigned int Hin, unsigned int Wout, unsigned int Hout, unsigned int Channels, InOut_t InOut_Type, resize_kop_t Type)
@@ -186,6 +212,51 @@ int GenerateResizeMultiChannelQ16(char *Name, unsigned int Win, unsigned int Hin
 }
 
 
+int GenerateResizeMultiChannel_fp16(char *Name, unsigned int Win, unsigned int Hin, unsigned int Wout, unsigned int Hout, unsigned int Channels, InOut_t InOut_Type, resize_kop_t Type)
+
+{
+	char *ResizeKerName;
+	switch (Type){
+		case KOP_BILINEAR_RESIZE:
+			ResizeKerName = "KerResizeBilinear_fp16";
+			break;
+		case KOP_NEAREST_NEIGHBOR_RESIZE:
+			ResizeKerName = "KerResizeNearestNeighbor_fp16";
+			break;
+		default:
+			ResizeKerName = "KerResizeBilinear_fp16";
+	}
+	printf("Generating %s\n", ResizeKerName);
+	int LayerOp = Channels * Wout * Hout * (3 + 6 + 3);
+	int LayerBandwidth = Channels * Win * Hin + Channels * Hout * Wout;
+	Kernel_T *Kernel = UserKernel(Name,
+		KernelIterSpace(2, IterFixedSpace(KER_ITER_D0, Channels), IterTiledSpace(KER_ITER_TILE0)),
+		(Hin==1)?TILE_VER:TILE_HOR,
+		CArgs(2, TCArg("F16 *", "In"), TCArg("F16 *", "Out")),
+		Calls(1, Call(ResizeKerName, LOC_LOOP,
+			Bindings(8, K_Arg("In", KER_ARG_TILE),
+				        K_Arg("In", KER_ARG_W),
+				        K_Arg("In", KER_ARG_H),
+				        K_Arg("Out", KER_ARG_TILE),
+				        K_Arg("Out", KER_ARG_W),
+				        K_Arg("Out", KER_ARG_H),
+				        K_Arg("Out", KER_ARG_TILE_H),
+				        K_Arg("In", KER_ARG_TILE_BASE)))),
+		KerArgs(2,
+			KerArg("In" , KerArgSpace(2,KER_ITER_D0,KER_ITER_TILE0), OBJ_IN_DB,  Win,  Hin,  sizeof(short), 1, OBJ_CONSTRAINTS_DYNAMIC, 0, "In"),
+			KerArg("Out", KerArgSpace(2,KER_ITER_D0,KER_ITER_TILE0), OBJ_OUT_DB, Wout, Hout, sizeof(short), 0, OBJ_CONSTRAINTS_DYNAMIC, 0, "Out")
+		)
+	);
+	if (Kernel) {
+		AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0);
+		AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0);
+
+		AddKernelFloatArgDim(Name, "In", 4, Channels, Hin, Win, 2);
+		AddKernelFloatArgDim(Name, "Out", 4, Channels, Hout, Wout, 2);
+	}
+	return (Kernel!=0);
+}
+
 void ResizeConfiguration(unsigned int L1Memory)
 
 {
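
For scale, GenerateResizeMultiChannel_fp16 above books (3 + 6 + 3) operations per
output pixel and counts one full input plus one full output tensor per channel as
bandwidth. Worked through with assumed example sizes:

    # assumed example: 3-channel 120x120 -> 240x240 resize
    channels, win, hin, wout, hout = 3, 120, 120, 240, 240
    layer_op = channels * wout * hout * (3 + 6 + 3)   # 2073600 operations
    layer_bw = channels * (win * hin + wout * hout)   # 216000 items moved
    print(layer_op, layer_bw)
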
diff --git a/tools/autotiler_v3/Generators/BilinearResizes/ResizeGenerator.h b/tools/autotiler_v3/Generators/BilinearResizes/ResizeGenerator.h
index 83e8b7539..901623ab0 100644
--- a/tools/autotiler_v3/Generators/BilinearResizes/ResizeGenerator.h
+++ b/tools/autotiler_v3/Generators/BilinearResizes/ResizeGenerator.h
@@ -57,6 +57,7 @@ Generate Resizing for a given set of parameters
 */
 int GenerateResizeMultiChannel(char *Name, unsigned int Win, unsigned int Hin, unsigned int Wout, unsigned int Hout, unsigned int Channels, InOut_t InOut_Type, resize_kop_t Type);
 int GenerateResizeMultiChannelQ16(char *Name, unsigned int Win, unsigned int Hin, unsigned int Wout, unsigned int Hout, unsigned int Channels, InOut_t InOut_Type, resize_kop_t Type);
+int GenerateResizeMultiChannel_fp16(char *Name, unsigned int Win, unsigned int Hin, unsigned int Wout, unsigned int Hout, unsigned int Channels, InOut_t InOut_Type, resize_kop_t Type);
 
 #define GenerateResizeNew(Name, Win, Hin, Wout, Hout, Type) GenerateResizeMultiChannel(Name, Win, Hin, Wout, Hout, 1, UNSIGNED_INOUT, Type)
 #define GenerateResize(Name, Win, Hin, Wout, Hout)  GenerateResizeNew(Name, Win, Hin, Wout, Hout, KOP_BILINEAR_RESIZE)
diff --git a/tools/autotiler_v3/Makefile b/tools/autotiler_v3/Makefile
index 25a7ce5dd..c6f308357 100644
--- a/tools/autotiler_v3/Makefile
+++ b/tools/autotiler_v3/Makefile
@@ -1,4 +1,4 @@
-TILER_VER=4.2.0
+TILER_VER=4.1.0
 export TILER_LIB=libtile.${TILER_VER}.a
 ifdef GAP_SDK_HOME
 export TILER_URL=$(GAP_SDK_HOME)/.tiler_url
diff --git a/tools/autotiler_v3/version.cfg b/tools/autotiler_v3/version.cfg
index 22bf7a2cf..dbce13460 100644
--- a/tools/autotiler_v3/version.cfg
+++ b/tools/autotiler_v3/version.cfg
@@ -3,7 +3,7 @@
         {
             "version": "autotiler-v3",
             "magicNum": 718930176,
-            "git-hash": "29855c78d247eb5b3b3b0758b10dfc08306d06c9"
+            "git-hash": "bd6fd2447d2ccd4dcdd8c774a3a130b3141fac22"
         }
     ]
 }
\ No newline at end of file
diff --git a/tools/jenkins/gap_sdk_version.txt b/tools/jenkins/gap_sdk_version.txt
index 642d57e4b..1f8776c61 100644
--- a/tools/jenkins/gap_sdk_version.txt
+++ b/tools/jenkins/gap_sdk_version.txt
@@ -1 +1 @@
-c6603c58b94f3ac80ade4370c2b3c4e558f246f2
+0addfcd309a48d5a5dbd3e058de0cf1ffc660a2b
diff --git a/tools/nntool/.gitignore b/tools/nntool/.gitignore
index bd00d96a3..45c264225 100644
--- a/tools/nntool/.gitignore
+++ b/tools/nntool/.gitignore
@@ -16,3 +16,6 @@ importer/tflite
 .env
 gsync
 tempdir/
+notebooks/
+utils_nocheckin/
+graphs_nocheckin/
diff --git a/tools/nntool/generation/code_block.py b/tools/nntool/generation/code_block.py
index b072c79bd..e615c6844 100644
--- a/tools/nntool/generation/code_block.py
+++ b/tools/nntool/generation/code_block.py
@@ -62,7 +62,11 @@ def write_start(self, fmt, *args):
 
     def comment(self, fmt, *args):
         fmt = self.get_indent() + '// ' + fmt
-        self._lines.append(fmt.format(*args))
+        if args:
+            self._lines.append(fmt.format(*args))
+        else:
+            self._lines.append(fmt)
+
         return self
 
     def __str__(self):
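
The comment() change above matters when the comment text itself contains braces,
for instance a generated C initializer: calling str.format() with no arguments on
such text raises. A standalone sketch of the fixed behaviour (not the nntool class
itself):

    def comment(lines, indent, fmt, *args):
        # only run str.format when arguments were supplied, so literal
        # braces in argument-less comments survive untouched
        fmt = indent + '// ' + fmt
        lines.append(fmt.format(*args) if args else fmt)

    out = []
    comment(out, '', 'LUT = {0x10, 0x20}')           # no args: format() would raise
    comment(out, '', 'node {} step {}', 'conv1', 3)  # args: formatted as before
    print('\n'.join(out))
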
diff --git a/tools/nntool/generation/generators/kernels/general/resizer_kernel_generator.py b/tools/nntool/generation/generators/kernels/general/resizer_kernel_generator.py
index c22bb788f..439282e95 100644
--- a/tools/nntool/generation/generators/kernels/general/resizer_kernel_generator.py
+++ b/tools/nntool/generation/generators/kernels/general/resizer_kernel_generator.py
@@ -26,7 +26,7 @@
               "nearest_neighbor": "KOP_NEAREST_NEIGHBOR_RESIZE"}
 
 
-def gen_at_resizer(code_block, name, in_dim, new_shape, inout_t, resize_kop, q16mode):
+def gen_at_resizer(code_block, name, in_dim, new_shape, inout_t, resize_kop, q16mode, fp16):
     if in_dim.has_key('w') and in_dim.has_key('h'):
         in_dim_w, in_dim_h, in_dim_c = in_dim.w, in_dim.h, in_dim.c
     else:
@@ -47,7 +47,10 @@ def gen_at_resizer(code_block, name, in_dim, new_shape, inout_t, resize_kop, q16
             # If both HW change from 1x1 to HxW this is not going to work
             LOG.warning(f"Resize Node {name} has 1x1xc input but resizes both HW dimension, could not work in autotiler")
 
-    GenKernel = "GenerateResizeMultiChannelQ16" if q16mode else "GenerateResizeMultiChannel"
+    if fp16:
+        GenKernel = "GenerateResizeMultiChannel_fp16"
+    else:
+        GenKernel = "GenerateResizeMultiChannelQ16" if q16mode else "GenerateResizeMultiChannel"
     code_block.write('{}("{}", {}, {}, {}, {}, {}, {}, {});', GenKernel,
                      name, win, hin, wout, hout, chin, inout_t, resize_kop)
 
@@ -67,7 +70,8 @@ def __init__(self, cname, params, qrec):
         self.inout_type = "SIGNED_INOUT" if qrec.in_qs[0].signed else "UNSIGNED_INOUT"
         self.type = params.op_name
         self.new_shape = params.new_shape
-        self.q16 = True if qrec.in_qs[0].dtype_bits == 16 else False
+        self.q16 = qrec.in_qs[0].dtype_bits == 16
+        self.fp16 = qrec.in_qs[0].is_floating
 
     def code(self, code_block=None):
         if code_block is None:
@@ -76,5 +80,6 @@ def code(self, code_block=None):
         code_block.comment("generator for {}", self.node_name)
 
         gen_at_resizer(code_block, self.cname, self.in_dim,
-                       self.new_shape, self.inout_type, RESIZE_KOP[self.type], self.q16)
+                       self.new_shape, self.inout_type, RESIZE_KOP[self.type],
+                       self.q16, self.fp16)
         return code_block
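
The plumbing above reduces to a three-way dispatch on the tensor type. As a compact
sketch (the function name is illustrative, not part of the tool):

    def select_resize_generator(q16: bool, fp16: bool) -> str:
        # float16 graphs win over 16-bit fixed point; 8-bit is the default
        if fp16:
            return "GenerateResizeMultiChannel_fp16"
        return "GenerateResizeMultiChannelQ16" if q16 else "GenerateResizeMultiChannel"

    assert select_resize_generator(q16=True, fp16=True) == "GenerateResizeMultiChannel_fp16"
    assert select_resize_generator(q16=True, fp16=False) == "GenerateResizeMultiChannelQ16"
    assert select_resize_generator(q16=False, fp16=False) == "GenerateResizeMultiChannel"
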
diff --git a/tools/nntool/generation/generators/kernels/pow2/conv_pool_relu_kernels_generator.py b/tools/nntool/generation/generators/kernels/pow2/conv_pool_relu_kernels_generator.py
index e52a9064d..6ef573390 100644
--- a/tools/nntool/generation/generators/kernels/pow2/conv_pool_relu_kernels_generator.py
+++ b/tools/nntool/generation/generators/kernels/pow2/conv_pool_relu_kernels_generator.py
@@ -236,7 +236,7 @@ def __init__(self, node_name, cname, conv_params, conv_q,
                     "only homogenious operations are supported at present")
             LOG.debug("%s: pool relu inq %s outq %s control block",
                       node_name, in_q, out_q)
-            if at_pool_params.PoolOper == 'KOP_NONE' and not in_dim.is_named and in_dim.has_keys(['c', 'w', 'h']):
+            if at_pool_params.PoolOper == 'KOP_NONE' and (not in_dim.is_named or not in_dim.has_keys(['c', 'w', 'h'])):
                 in_shape = in_dim.shape + ([1] * (3 - len(in_dim.shape)))
                 in_c, in_h, in_w = in_shape[0], in_shape[1], in_shape[2]
             else:
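
The old guard asked for a dim that is unnamed yet answers has_keys(['c', 'w', 'h']),
which an unnamed dim can hardly satisfy, so the fallback shape path was effectively
dead; the fix takes the fallback whenever either requirement fails. The equivalence,
as a quick check:

    for is_named in (False, True):
        for has_cwh in (False, True):
            # fixed guard == negation of "fully named with c/w/h keys"
            assert ((not is_named) or (not has_cwh)) == (not (is_named and has_cwh))
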
diff --git a/tools/nntool/generation/new_generators/general/dsp_generators.py b/tools/nntool/generation/new_generators/general/dsp_generators.py
index f500cb61a..3186e8073 100644
--- a/tools/nntool/generation/new_generators/general/dsp_generators.py
+++ b/tools/nntool/generation/new_generators/general/dsp_generators.py
@@ -169,7 +169,7 @@ def __init__(self, cname, params, qrec, gen_ctrl=None):
             'frame_stride': params.frame_step,
             'n_fft': params.n_fft,
             'n_melbanks': params.n_fbanks,
-            'size_mel_coeff': params.get_melfilter_size()[1],
+            'size_mel_coeff': params.get_melfilter_size()[0],
             'n_dct': params.n_dct,
             'preemp_factor': params.preemp_factor,
             'no_window': int(params.win_fn is None),
diff --git a/tools/nntool/generation/new_generators/general/quantize_parameters.py b/tools/nntool/generation/new_generators/general/quantize_parameters.py
index 8ed6b6134..d24c7c37d 100644
--- a/tools/nntool/generation/new_generators/general/quantize_parameters.py
+++ b/tools/nntool/generation/new_generators/general/quantize_parameters.py
@@ -52,14 +52,12 @@ def cache_values(cls, node, qrec):
         qrec.cache['float_conversion'] = float_conversion = in_q.is_floating or out_q.is_floating
         qrec.cache['bit_conversion'] = bit_conversion = in_q.bits != out_q.bits
         if float_conversion:
-            if in_q.dtype in [bfloat16, np.float16]:
+            if in_q.is_floating:
                 qrec.cache['kernel_type'] = 'KOP_CONVERT_FL_FP'
-                qrec.cache['in_at_size'] = 2
-                qrec.cache['out_at_size'] = at_bits(out_q)
             else:
                 qrec.cache['kernel_type'] = 'KOP_CONVERT_FP_FL'
-                qrec.cache['in_at_size'] = at_bits(in_q)
-                qrec.cache['out_at_size'] = 2
+            qrec.cache['in_at_size'] = at_bits(in_q)
+            qrec.cache['out_at_size'] = at_bits(out_q)
             return True
         else:
             qrec.cache['in_at_size'] = at_bits(in_q)
@@ -107,7 +105,7 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
             else:
                 offset = (int(math.pow(2, bits)) - in_q.zero_point[0] +
                           out_q.zero_point[0]).astype(out_q.dtype)
-            contents = np.array(list(offset.tobytes()) + ([0] * 6), dtype=np.uint8)
+            contents = np.array(list(offset.tobytes()) + ([0] * 7), dtype=np.uint8)
         elif qrec.cache['kernel_type'] == 'KOP_CONVERT_FP_FP':
             # no infos needed
             return True
@@ -133,15 +131,15 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
             qbias = list(scale_adjust.qbiases.tobytes())
             qbias = qbias + [0] * (2 - len(qbias))
             qnorm = list(scale_adjust.qnorms.tobytes())
-            contents = np.array(zero_adjust + qbias + qnorm, dtype=np.int8)
+            contents = np.array(zero_adjust + qbias + qnorm + [0], dtype=np.int8)
         elif qrec.cache['kernel_type'] == 'KOP_CONVERT_FL_FP':
-            qbias = list((1/out_q.scale).astype(in_q.dtype).tobytes())
-            zero_adjust = list((out_q.zero_point.astype(np.int32) * out_q.scale).astype(in_q.dtype).tobytes())
-            contents = np.array(zero_adjust + [0, 0] + qbias + [0], dtype=np.int8)
+            qbias = list((1/out_q.scale).astype(np.float32).tobytes())
+            zero_adjust = list((out_q.zero_point.astype(np.int32) * out_q.scale).astype(np.float32).tobytes())
+            contents = np.array(zero_adjust + qbias, dtype=np.int8)
         elif qrec.cache['kernel_type'] == 'KOP_CONVERT_FP_FL':
-            qbias = list((in_q.scale).astype(out_q.dtype).tobytes())
-            zero_adjust = list((-in_q.zero_point.astype(np.int32)).astype(out_q.dtype).tobytes())
-            contents = np.array(zero_adjust + [0, 0] + qbias + [0], dtype=np.int8)
+            qbias = list((in_q.scale).astype(np.float32).tobytes())
+            zero_adjust = list((-in_q.zero_point.astype(np.int32)).astype(np.float32).tobytes())
+            contents = np.array(zero_adjust + qbias, dtype=np.int8)
         else:
             raise ValueError(f"strange dtype change in {pnode.name}")
         cname, file_name = gen_constant(gen, pnode, pnode, INFOS)
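
With the change above, the float<->fixed conversion infos always carry two float32
fields, a zero adjustment followed by a scale factor, instead of packing in the
float input's own dtype plus padding bytes. A minimal sketch of the resulting
layout, with assumed example values:

    import numpy as np

    out_scale = np.float32(0.05)      # assumed output quantization scale
    out_zero_point = np.int32(128)    # assumed output zero point

    zero_adjust = np.float32(out_zero_point * out_scale).tobytes()
    qbias = np.float32(1.0 / out_scale).tobytes()
    contents = np.frombuffer(zero_adjust + qbias, dtype=np.int8)
    assert contents.size == 8         # two packed float32 values
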
diff --git a/tools/nntool/generation/new_generators/mult8/matmul_mult8.py b/tools/nntool/generation/new_generators/mult8/matmul_mult8.py
index 02bbaae2f..c09c847db 100644
--- a/tools/nntool/generation/new_generators/mult8/matmul_mult8.py
+++ b/tools/nntool/generation/new_generators/mult8/matmul_mult8.py
@@ -136,13 +136,15 @@ def set_matmul_bindings(gen, in_eparams, out_eparams, cname, node, node_q, out_q
             "Node {} inq {} outq {}", node.name,
             str(node_q.in_qs[0]), str(out_q.out_qs[0]))
     )
+    idx_0 = 0 if node_q.cache.get("ne16") else 1
+    idx_1 = 1 if node_q.cache.get("ne16") else 0
     if len(node.in_dims) == 3:
         if len(node_q.in_qs[1].scale) > 1:
             gen.bindings.append(
                 NodeBindingList(
                     cname,
-                    GNodeArgEdge(in_eparams[1]),
-                    GNodeArgEdge(in_eparams[0]),
+                    GNodeArgEdge(in_eparams[idx_0]),
+                    GNodeArgEdge(in_eparams[idx_1]),
                     GNodeArgEdge(in_eparams[2]),
                     GNodeArgEdge(out_eparams[0], "GNA_OUT"),
                     GNodeArgNode(node, MULSCALE),
@@ -152,8 +154,8 @@ def set_matmul_bindings(gen, in_eparams, out_eparams, cname, node, node_q, out_q
             gen.bindings.append(
                 NodeBindingList(
                     cname,
-                    GNodeArgEdge(in_eparams[1]),
-                    GNodeArgEdge(in_eparams[0]),
+                    GNodeArgEdge(in_eparams[idx_0]),
+                    GNodeArgEdge(in_eparams[idx_1]),
                     GNodeArgEdge(in_eparams[2]),
                     GNodeArgEdge(out_eparams[0], "GNA_OUT"),
                     GNodeArgNode(node, INFOS)))
@@ -162,8 +164,8 @@ def set_matmul_bindings(gen, in_eparams, out_eparams, cname, node, node_q, out_q
             gen.bindings.append(
                 NodeBindingList(
                     cname,
-                    GNodeArgEdge(in_eparams[1]),
-                    GNodeArgEdge(in_eparams[0]),
+                    GNodeArgEdge(in_eparams[idx_0]),
+                    GNodeArgEdge(in_eparams[idx_1]),
                     GNodeArgEdge(out_eparams[0], "GNA_OUT"),
                     GNodeArgNode(node, MULSCALE),
                     GNodeArgNode(node, MULSHIFT),
@@ -172,8 +174,8 @@ def set_matmul_bindings(gen, in_eparams, out_eparams, cname, node, node_q, out_q
             gen.bindings.append(
                 NodeBindingList(
                     cname,
-                    GNodeArgEdge(in_eparams[1]),
-                    GNodeArgEdge(in_eparams[0]),
+                    GNodeArgEdge(in_eparams[idx_0]),
+                    GNodeArgEdge(in_eparams[idx_1]),
                     GNodeArgEdge(out_eparams[0], "GNA_OUT"),
                     GNodeArgNode(node, INFOS)))
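
The repeated edits above all apply one rule: with NE16 enabled the matmul bindings
keep the graph's input order, while the software kernels expect the two operands
swapped. Reduced to a sketch (uses_ne16 stands in for node_q.cache.get("ne16")):

    def matmul_input_order(uses_ne16: bool):
        idx_0 = 0 if uses_ne16 else 1
        idx_1 = 1 if uses_ne16 else 0
        return idx_0, idx_1

    assert matmul_input_order(True) == (0, 1)    # NE16: keep graph order
    assert matmul_input_order(False) == (1, 0)   # software kernel: swap
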
 
diff --git a/tools/nntool/graph/dim.py b/tools/nntool/graph/dim.py
index d2962636f..c1a2fd5e4 100644
--- a/tools/nntool/graph/dim.py
+++ b/tools/nntool/graph/dim.py
@@ -64,6 +64,8 @@ class MoreThanOneInputError(DimError):
 class Dim():
     def __init__(self, shape=None, names=None, is_ordered=False, is_unknown=False):
         set_shape = shape if shape is not None else [] if names is None else [None] * len(names)
+        if any(dim is not None and dim < 0 for dim in set_shape):
+            raise ValueError('invalid dim')
         super().__setattr__('_shape', set_shape)
         super().__setattr__('_names', names)
         super().__setattr__('_is_ordered', is_ordered)
@@ -486,6 +488,8 @@ def _do_operation(self, other, op) -> 'Dim':
             res = self.clone()
             for k in self.keys:
                 setattr(res, k, op(getattr(res, k), other))
+            if any(dim is not None and dim < 0 for dim in res.shape):
+                raise ValueError('invalid dim')
             return res
 
         if isinstance(other, Dim):
@@ -500,6 +504,8 @@ def _do_operation(self, other, op) -> 'Dim':
                         setattr(res, k, getattr(other, k))
                     elif getattr(other, k) is not None:
                         setattr(res, k, op(getattr(self, k), getattr(other, k)))
+            if any(dim is not None and dim < 0 for dim in res.shape):
+                raise ValueError('invalid dim')
             return res
 
         raise TypeError("Inapropriate types for operation")
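
The new checks make a negative dimension fail fast, at construction and after
arithmetic, rather than surfacing later as a bogus tensor size. The observable
behaviour (run from tools/nntool so that graph.dim imports):

    from graph.dim import Dim

    Dim.unnamed([1, 224, 224, 3])           # fine
    try:
        Dim.unnamed([1, -224, 224, 3])      # now rejected up front
    except ValueError as exc:
        print(exc)                          # 'invalid dim'
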
diff --git a/tools/nntool/graph/manipulations/adjust_order.py b/tools/nntool/graph/manipulations/adjust_order.py
index eba46fffa..4067c3397 100644
--- a/tools/nntool/graph/manipulations/adjust_order.py
+++ b/tools/nntool/graph/manipulations/adjust_order.py
@@ -26,7 +26,7 @@
 LOG = logging.getLogger("nntool." + __name__)
 
 
-def adjust_order(G, reshape_weights=True, postprocess=True, debug_function=None, one_cycle=False):
+def adjust_order(G, reshape_weights=True, postprocess=True, debug_function=None, one_cycle=False, single_step=False):
     opts = {'reshape_weights': reshape_weights}
     selector = AdjusterBase.get_all_handlers(opts)
     LOG.info("adding transposes to correct tensor order for AT kernels")
@@ -55,5 +55,5 @@ def adjust_order(G, reshape_weights=True, postprocess=True, debug_function=None,
     if debug_function:
         debug_function(G)
     if postprocess:
-        eliminate_transposes(G, debug_function=debug_function, one_cycle=one_cycle)
+        eliminate_transposes(G, debug_function=debug_function, one_cycle=one_cycle, single_step=single_step)
         add_dimensions(G)
diff --git a/tools/nntool/graph/manipulations/dimensions.py b/tools/nntool/graph/manipulations/dimensions.py
index 6ebaf0b71..f883d4d02 100644
--- a/tools/nntool/graph/manipulations/dimensions.py
+++ b/tools/nntool/graph/manipulations/dimensions.py
@@ -33,7 +33,7 @@ def set_out_edges_multi(G, node: Parameters, dims: Sequence[Dim], step_idx: int,
                         naming_convension: NamingConvension, edge_type: str = "in_out"):
     # clone the dims first so that the edge dims are the same objects as the node output dims
     dims = node.set_output_size(dims)
-    out_edges = G.indexed_out_edges(node.name)
+    out_edges = G.indexed_out_edges(node)
     is_multi_out = len(out_edges) > 1
     for edge_idx, edge_group in enumerate(out_edges):
         if not edge_group:
diff --git a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py
index f77cfcf64..3be578e8d 100644
--- a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py
+++ b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py
@@ -13,6 +13,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
+from functools import reduce
 import logging
 from collections.abc import MutableSet
 from copy import deepcopy
@@ -24,8 +25,9 @@
                          InputParameters, LinearFusionParameters,
                          OutputParameters, PadParameters, ReshapeParameters,
                          ReverseParameters, StridedSliceParameters,
-                         TransposeParameters)
+                         TransposeParameters, ActivationParameters)
 from graph.types.base import NNEdge, SensitiveToOrder
+from graph.types.others import CopyParameters, UnaryOpParameters
 from graph.types.tensor_arithmetic import Broadcastable
 from utils.compatible_transposes import (find_all_compatible_transposes,
                                          find_combination)
@@ -58,6 +60,10 @@ def info(msg):
     LOG.info(msg)
 
 
+def debug(msg):
+    LOG.debug(msg)
+
+
 TRANSIENT_ACTIONS = {
     PadParameters: TransposePad,
     ReverseParameters: TransposeReverse,
@@ -76,31 +82,23 @@ class CantContinueError(Exception):
 
 
 class TransposeHistory():
-    def __init__(self, node, from_shape, from_transpose, to_shape=None, to_transpose=None) -> None:
-        # can be initialized with to_shape set to none in which case from == to
-        # to_transpose == None means that the transpose has been eliminated by a reshape
+    def __init__(self, node, from_shape=None, transpose=None, to_shape=None) -> None:
         self.node = node
-        self._from = (from_shape, from_transpose)
-        if to_shape is None:
-            self._to = (from_shape, from_transpose)
-        else:
-            self._to = (to_shape, to_transpose)
+        self._from = from_shape
+        self._transpose = transpose
+        self._to = to_shape
 
     @property
     def from_shape(self):
-        return self._from[0]
+        return self._from
 
     @property
     def to_shape(self):
-        return self._to[0]
+        return self._to
 
     @property
-    def from_transpose(self):
-        return self._from[1]
-
-    @property
-    def to_transpose(self):
-        return self._to[1]
+    def transpose(self):
+        return self._transpose
 
 
 class VisitedNodes(MutableSet):
@@ -215,9 +213,26 @@ def none_or_idx(trans, idx):
     return None if trans[idx] is None else idx
 
 
+def reverse_broadcast(old_shape, new_shape, transpose):
+    old_shape_idx = new_shape_idx = 0
+    res_pos = {}
+    while old_shape_idx < len(old_shape) or new_shape_idx < len(new_shape):
+        if (old_shape_idx < len(old_shape) and new_shape_idx < len(new_shape)
+                and old_shape[old_shape_idx] == new_shape[new_shape_idx]):
+            res_pos[old_shape_idx] = new_shape_idx
+            old_shape_idx += 1
+            new_shape_idx += 1
+        elif new_shape_idx < len(new_shape) and new_shape[new_shape_idx] == 1:
+            new_shape_idx += 1
+        else:
+            raise ValueError(
+                f'reverse broadcast not possible between {old_shape} and {new_shape}')
+    return tuple([res_pos[idx] for idx in transpose] + [idx for idx, _ in enumerate(new_shape) if idx not in res_pos.values()])
+
+
 def requires_reshape(trans1, trans2, dim):
     """Checks if layout shape doesn't change but a reshape is necessary due to 1 position"""
-    if tuple(dim.shape) != tuple(dim.layout_shape) and dim.layout_shape == dim.calc_transpose(trans1).calc_transpose(trans2).layout_shape:
+    if (tuple(dim.shape) != tuple(dim.layout_shape) and
+            dim.layout_shape == dim.calc_transpose(trans1).calc_transpose(trans2).layout_shape):
         from_shape = dim.calc_transpose(trans1)
         to_shape = dim.calc_transpose(trans2)
         if from_shape.shape != to_shape.shape:
@@ -302,7 +317,26 @@ def compute_max_shape(dims):
     return [max(dims) for dims in zip(*shapes)]
 
 
-def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge, transpose_history: Sequence[TransposeHistory]):
+def broadcasted_axes(shape, full_shape):
+    return tuple(range(len(full_shape) - len(shape)))
+
+
+def reduce_dimension(axis, stripped_axes):
+    """reduces axis by the number of stripped axes that are less than it"""
+    return axis - sum(1 if stripped_axis < axis else 0 for stripped_axis in stripped_axes)
+
+
+def expand_axes_in_transpose(transpose, num_new_axes):
+    """increases axis by the number of new axes"""
+    return tuple(list(range(num_new_axes)) + [axis + num_new_axes for axis in transpose])
+
+
+def strip_axes_from_transpose(transpose, stripped_axes):
+    return tuple(reduce_dimension(axis, stripped_axes) for axis in transpose if axis not in stripped_axes)
+
+
+def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge,
+                transpose_history: Sequence[TransposeHistory]):
     """Searches down the graph for something that eliminates transpose
 
     Args:
@@ -311,7 +345,8 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge, tr
         visited_nodes : Nodes already traversed
         in_edge : The edge we are arriving on at this node
         transpose_history : A history of the reshapes passed that did not allow us to determine the transpose
-        transpose : The current transpose being propagated. Can be None to indicate that we cannot translate the transpose via that reshape
+        transpose : The current transpose being propagated. Can be None to indicate that we cannot translate
+                    the transpose via that reshape
 
     Returns:
         A tuple of a list of actions and a list of nodes traversed
@@ -319,15 +354,16 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge, tr
     cur_visited_nodes = VisitedNodes()
     cur_visited_nodes.visit_down(node, in_edge.to_idx)
 
-    transpose = transpose_history[-1].to_transpose
+    transpose = transpose_history[-1].transpose
     in_shape = node.in_dims[in_edge.to_idx].shape
+    debug(f'down at {node.name} trans {transpose} shape {in_shape}')
     if transpose is not None and len(transpose) == 1:
         return [EndActionDown(node)], []
 
-    if isinstance(node, SensitiveToOrder) and transpose_does_nothing(transpose, in_shape):
+    if isinstance(node, SensitiveToOrder) and transpose_does_nothing(reverse_transpose(transpose), in_shape):
         check_for_null_transpose(node, transpose)
         new_shape = apply_transpose(
-            in_shape, transpose)
+            in_shape, reverse_transpose(transpose))
         # could be that the transpose does nothing to the data layout but still changes the positions of
         # the ones in the shape
         if new_shape == in_shape:
@@ -343,30 +379,30 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge, tr
         check_for_null_transpose(node, transpose)
         info(
             f'rejected {node.name}  - sensitive to order - inserting transpose {transpose}')
-        return [InsertTransposeAction(node, direction='in', idx=in_edge.to_idx, transpose=transpose), EndActionDown(node)], cur_visited_nodes
+        return ([InsertTransposeAction(node, direction='in', idx=in_edge.to_idx, transpose=transpose),
+                 EndActionDown(node)], cur_visited_nodes)
 
     cur_actions = []
 
-    # if arriiving on a broadcasted input the transpose needs to be expanded and a reshape inserted
-    # TODO - Expand to handle multiple outputs on expressions
-    if isinstance(node, Broadcastable) and node.in_dims[in_edge.to_idx].rank != node.out_dims[0].rank:
+    # if arriving on a broadcasted input the transpose needs to be expanded
+    # since the transpose is only acting on the broadcasted dimensions no reshape is necessary
+    if isinstance(node, Broadcastable) and len(in_shape) != node.out_dims[0].rank:
         check_for_null_transpose(node, transpose)
+        # This could be an expression so we need to broadcast the output
         max_shape = compute_max_shape(node.out_dims)
-        in_shape, new_shape, new_transpose = broadcast_expand(
-            max_shape, in_shape, transpose)
-        from_shape = apply_transpose(in_shape, reverse_transpose(transpose))
-        to_shape = apply_transpose(new_shape, reverse_transpose(new_transpose))
-        info(f'{node.name} broadcasted input {in_edge.to_idx} requires reshape {from_shape}->{to_shape}')
-        cur_actions.append(
-            InsertReshapeAction(
-                node, direction='in', idx=in_edge.to_idx,
-                in_shape=from_shape,
-                out_shape=to_shape))
+        b_axes = broadcasted_axes(in_shape, max_shape)
+
+        new_transpose = expand_axes_in_transpose(transpose, len(b_axes))
+        new_shape = tuple(([1] * len(b_axes)) + in_shape)
+        transpose_history += [
+            TransposeHistory(node, in_shape,
+                             new_transpose,
+                             new_shape)
+        ]
         transpose = new_transpose
         in_shape = new_shape
 
     # on nodes where inputs must have the same transpose applied i.e. Binary arithmetic ops etc
-    # TODO - Expand to handle multiple outputs on expressions
     if any(isinstance(node, cls) for cls in NODES_TO_EXPLORE_UP):
         check_for_null_transpose(node, transpose)
         max_shape = compute_max_shape(node.out_dims)
@@ -374,24 +410,39 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge, tr
             if check_continue(visited_nodes, cur_visited_nodes, exclude_nodes, edge.from_node, 'up', edge.from_idx):
                 continue
             # if other edges are broadcasted they may need to be reshaped
-            edge_in_shape, new_shape, new_transpose = broadcast_reduce(
-                max_shape, node.in_dims[edge.to_idx].shape, transpose)
+            edge_in_shape = node.in_dims[edge.to_idx].shape
+            # different rank so broadcasted
+            if len(edge_in_shape) != len(max_shape):
+                # strip the broadcasted axis from the transpose
+                b_axes = broadcasted_axes(edge_in_shape, max_shape)
+                transpose_without_broadcast = strip_axes_from_transpose(
+                    reverse_transpose(transpose), b_axes)
+                # from shape will be the old shape with the unbroadcasted transpose
+                from_shape = apply_transpose(
+                    edge_in_shape, transpose_without_broadcast)
+            # to shape is the broadcasted input shape under the transpose, with leading ones removed
+                broadcasted_shape = ([1] * len(b_axes)) + list(edge_in_shape)
+                to_shape = strip_leading_ones(apply_transpose(
+                    broadcasted_shape, reverse_transpose(transpose)), len(from_shape))
+                # if they are not equal insert a reshape
+                if from_shape != to_shape:
+                    info(
+                        f'{node.name} broadcasted input {edge.to_idx} requires reshape {from_shape}->{to_shape}')
+                    cur_actions.append(
+                        InsertReshapeAction(
+                            node, direction='in', idx=edge.to_idx,
+                            in_shape=from_shape,
+                            out_shape=to_shape
+                        ))
+                new_transpose = transpose_without_broadcast
+            else:
+                new_transpose = transpose
+
             new_history = [
-                TransposeHistory(node, apply_transpose(
-                    node.in_dims[edge.to_idx].shape, new_transpose), reverse_transpose(new_transpose))
+                TransposeHistory(node, edge_in_shape,
+                                 new_transpose, edge_in_shape)
             ]
-            from_shape = apply_transpose(
-                edge_in_shape, reverse_transpose(new_transpose))
-            to_shape = apply_transpose(new_shape, reverse_transpose(transpose))
-            if from_shape != to_shape:
-                info(
-                    f'{node.name} broadcasted input {edge.to_idx} requires reshape {from_shape}->{to_shape}')
-                cur_actions.append(
-                    InsertReshapeAction(
-                        node, direction='in', idx=edge.to_idx,
-                        in_shape=from_shape,
-                        out_shape=strip_leading_ones(to_shape, len(from_shape))
-                    ))
+
             new_actions, visited_up_nodes = search_up(
                 G, edge.from_node, exclude_nodes, visited_nodes | cur_visited_nodes, edge, new_history)
             cur_visited_nodes |= visited_up_nodes
@@ -457,11 +508,12 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge, tr
         if identity_transpose(new_transpose):
             return cur_actions + [EndActionDown(node)], cur_visited_nodes
 
-        from_shape = do_transpose(reverse_transpose(transpose), node.slice_shape) if transpose is not None else None
+        from_shape = do_transpose(reverse_transpose(
+            transpose), node.slice_shape) if transpose is not None else None
 
         transpose_history = transpose_history + \
             [TransposeHistory(node, node.slice_shape,
-                                transpose, from_shape, new_transpose)]
+                              new_transpose, node.out_shape)]
         transpose = new_transpose
     elif node.__class__ in TRANSIENT_ACTIONS:
         check_for_null_transpose(node, transpose)
@@ -475,26 +527,46 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge, tr
             # the order wrong in this case
             old_transpose = get_reshape_transpose(
                 node.old_shape.shape, node.shape.shape)
-            new_transpose = apply_transpose(transpose, old_transpose)
+            if reverses_transpose(transpose, old_transpose):
+                # the reshape exactly undoes the propagated transpose:
+                # delete it and end the downward search here
+                return cur_actions + [
+                    DeleteReshapeAction(
+                        node
+                    ),
+                    EndActionDown(node)], cur_visited_nodes
+            new_transpose = apply_transpose(
+                transpose, old_transpose)
+            info(
+                f"pass reshape that is transpose {node.name} down trans: old {transpose} new {new_transpose} shape: old {node.old_shape} new {node.shape}")
+            # insert an action to rewrite the reshape shapes
+            from_shape = apply_transpose(
+                node.old_shape.shape, reverse_transpose(transpose))
+            to_shape = apply_transpose(
+                node.shape.shape, reverse_transpose(transpose))
         else:
             # the transpose that we are actually applying is the reverse of the transpose that we are propagating down
             # So we reverse the transpose before evaluating the reshape and then reverse the result
             new_transpose = reverse_transpose(reverse_reshape(
                 reverse_transpose(transpose), node.old_shape, node.shape))
-
-        info(
-            f"pass reshape {node.name} down trans: old {transpose} new {new_transpose} shape: old {node.old_shape} new {node.shape}")
-        if new_transpose is None and len(node.shape) > 1:
             info(
-                f"rejected {node.name} - transpose out - does not reverse - inserting transpose {transpose}")
-            return [InsertTransposeAction(node, direction='in', idx=in_edge.to_idx, transpose=transpose), EndActionDown(node)], cur_visited_nodes
-        # insert an action to rewrite the reshape shapes
-        from_shape = node.old_shape.calc_transpose(
-            reverse_transpose(transpose)) if transpose is not None else None
-        to_shape = node.shape.calc_transpose(reverse_transpose(
-            new_transpose)) if new_transpose is not None else None
+                f"pass reshape {node.name} down trans: old {transpose} new {new_transpose} shape: old {node.old_shape} new {node.shape}")
+
+            if new_transpose is None and len(node.shape) > 1:
+                info(
+                    f"rejected {node.name} - transpose out - does not reverse - inserting transpose {transpose}")
+                return [InsertTransposeAction(node, direction='in', idx=in_edge.to_idx, transpose=transpose), EndActionDown(node)], cur_visited_nodes
+
+            # insert an action to rewrite the reshape shapes
+            from_shape = apply_transpose(node.old_shape.shape,
+                                         reverse_transpose(transpose)) if transpose is not None else None
+            to_shape = apply_transpose(node.shape.shape, reverse_transpose(
+                new_transpose)) if new_transpose is not None else None
         info(f"rewrite reshape to {from_shape}->{to_shape}")
-        if from_shape is None or to_shape is None or from_shape.shape != to_shape.shape:
+        if from_shape is None or to_shape is None or from_shape != to_shape:
             cur_actions += [
                 SetReshapeAction(
                     node,
@@ -514,7 +586,7 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge, tr
 
         transpose_history = transpose_history + \
             [TransposeHistory(node, node.old_shape.shape,
-                              transpose, from_shape, new_transpose)]
+                              new_transpose, node.shape.shape)]
 
         if new_transpose is None:
             try:
@@ -542,19 +614,27 @@ def continue_down(G, node, exclude_nodes, visited_nodes, cur_visited_nodes, cur_
 
 
 def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history):
-    transpose = transpose_history[-1].to_transpose
+    transpose = transpose_history[-1].transpose
+    debug(f'up at {node.name} trans {transpose}')
     cur_visited_nodes = VisitedNodes()
     cur_visited_nodes.visit_up(node, out_edge.from_idx)
     if transpose is not None and len(transpose) == 1:
+        info(
+            f'accepted {node.name} - single dimension transpose')
         return [EndActionUp(node)], cur_visited_nodes
-    if isinstance(node, SensitiveToOrder) and transpose_does_nothing(transpose, node.out_dims[out_edge.from_idx].shape):
+    if isinstance(node, SensitiveToOrder) and transpose_does_nothing(reverse_transpose(transpose), node.out_dims[out_edge.from_idx].shape):
         new_shape = apply_transpose(
-            node.out_dims[out_edge.from_idx].shape, transpose)
+            node.out_dims[out_edge.from_idx].shape, reverse_transpose(transpose))
         # could be that the transpose does nothing to the data layout but still changes the positions of
         # the ones in the shape
         if new_shape == node.out_dims[out_edge.from_idx].shape:
+            info(
+                f'accepted {node.name} - transpose does nothing')
             return [EndActionUp(node)], cur_visited_nodes
 
+        info(
+            f'accepted {node.name} - transpose does nothing with reshape '
+            f'{node.out_dims[out_edge.from_idx].shape} -> {new_shape}')
         return [
             InsertReshapeAction(node, direction='out', idx=out_edge.from_idx, out_edge=out_edge,
                                 in_shape=node.out_dims[out_edge.from_idx].shape, out_shape=new_shape),
@@ -564,7 +644,7 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history
         check_for_null_transpose(node, transpose)
         info(
             f'rejected {node.name}  - sensitive to order - inserting transpose {transpose}')
-        return [InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, out_edge=out_edge, transpose=transpose), EndActionUp(node)], cur_visited_nodes
+        return [InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, out_edge=out_edge, transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes
 
     cur_actions = []
 
@@ -585,7 +665,7 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history
             exclude_nodes,
             visited_nodes | cur_visited_nodes,
             edge,
-            [TransposeHistory(node, apply_transpose(node.out_dims[edge.from_idx], transpose), reverse_transpose(transpose))])
+            [TransposeHistory(node, node.out_dims[edge.from_idx], transpose, apply_transpose(node.out_dims[edge.from_idx], transpose))])
         cur_visited_nodes |= visited_down_nodes
         cur_actions += new_actions
 
@@ -610,9 +690,9 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history
     if isinstance(node, TransposeParameters):
         check_for_null_transpose(node, transpose)
         # TODO - in_dims or out_dims - 99% sure in_dims
-        if reverses_transpose(node.transpose, transpose, node.in_dims[0]):
+        if tuple(node.transpose) == tuple(transpose):
             info(
-                f"accepted {node.name} - transpose {node.transpose} reversed out by {transpose} on {node.in_dims[0]}")
+                f"accepted {node.name} - transpose {node.transpose} equals {transpose} on {node.in_dims[0]}")
             reshape = requires_reshape(
                 node.transpose, transpose, node.out_dims[0])
             if reshape:
@@ -629,32 +709,46 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history
         check_for_null_transpose(node, transpose)
         if node.fixed_order:
             info(f"rejected {node.name} - fixed order input")
-            return [InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, out_edge=out_edge, transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes
+            return [InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, out_edge=out_edge, transpose=transpose), EndActionUp(node)], cur_visited_nodes
 
         info(
-            f"accepted {node.name} - input without fixed order - transpose input {transpose}")
-        return cur_actions + [ReorderInputDims.from_history(node, transpose_history, transpose=transpose), EndActionUp(node)], cur_visited_nodes
+            f"accepted {node.name} - input without fixed order - transpose input {reverse_transpose(transpose)}")
+        return cur_actions + [ReorderInputDims.from_history(node, transpose_history, transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes
 
     # Constant can be reordered
     if isinstance(node, ConstantInputParameters):
         check_for_null_transpose(node, transpose)
         info(
             f"accepted {node.name} - constant input - transpose constant {transpose}")
-        return cur_actions + [ReorderConstantInput.from_history(node, transpose_history, transpose=transpose), EndActionUp(node)], cur_visited_nodes
+        return cur_actions + [ReorderConstantInput.from_history(node, transpose_history, transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes
 
     # Conditions that can pass through the Transpose
-    if node.__class__ in TRANSIENT_ACTIONS:
+    if isinstance(node, StridedSliceParameters) and node.changes_shape:
+        reversed_below = reverse_transpose(transpose)
+        reversed_above = reverse_broadcast(
+            node.out_shape, node.post_slice_shape, reversed_below)
+        new_transpose = reverse_transpose(reversed_above)
+        transpose_history = transpose_history + \
+            [TransposeHistory(node, node.out_shape,
+                              new_transpose, node.post_slice_shape)]
         cur_actions.append(
-            TRANSIENT_ACTIONS[node.__class__](node, transpose, "up"))
+            TransposeSlidedSlice(node, reversed_above, "up", transpose))
+        transpose = new_transpose
+    elif node.__class__ in TRANSIENT_ACTIONS:
+        check_for_null_transpose(node, transpose)
+
+        cur_actions.append(
+            TRANSIENT_ACTIONS[node.__class__](node, reverse_transpose(transpose), "up"))
 
     elif isinstance(node, ReshapeParameters):
         check_for_null_transpose(node, transpose)  # TODO - may eliminate
-        new_transpose = reverse_reshape(transpose, node.shape, node.old_shape)
+        new_transpose = reverse_reshape(reverse_transpose(
+            transpose), node.shape, node.old_shape)
         # if the upwards shape has one dimension we keep going since we want to find
         # nodes such as a linear layer that can reorder their output filters
         # This could be extended to recurrent layers for the inner dimension
         info(
-            f"pass reshape {node.name} up trans: old {transpose} new {new_transpose} shape: old {node.old_shape} new {node.shape}")
+            f"pass reshape {node.name} up trans: old {transpose} new {new_transpose} shape: {node.old_shape} -> {node.shape}")
         if new_transpose is None and len(node.old_shape) > 1:
             info(f"rejected {node.name} - transpose in - does not reverse")
             return [InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, out_edge=out_edge, transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes
@@ -663,10 +757,9 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history
         from_shape = node.old_shape.calc_transpose(
             new_transpose) if new_transpose is not None else None
         to_shape = node.shape.calc_transpose(
-            transpose) if transpose is not None else None
+            reverse_transpose(transpose)) if transpose is not None else None
         transpose_history = transpose_history + \
-            [TransposeHistory(node, node.shape, transpose,
-                              to_shape, new_transpose)]
+            [TransposeHistory(node, node.shape, new_transpose, node.old_shape)]
         info(f"rewrite reshape to {from_shape}->{to_shape}")
         if from_shape is None or to_shape is None or from_shape.shape != to_shape.shape:
             cur_actions.extend([
@@ -709,22 +802,37 @@ def continue_up(G, node, exclude_nodes, visited_nodes, cur_visited_nodes, cur_ac
 
         if check_continue(visited_nodes, cur_visited_nodes, exclude_nodes, edge.from_node, 'up', edge.from_idx):
             continue
-
-        if isinstance(node, Broadcastable) and node.in_dims[edge.to_idx].rank != node.out_dims[0].rank:
+        edge_in_shape = node.in_dims[edge.to_idx].shape
+        if isinstance(node, Broadcastable) and len(edge_in_shape) != node.out_dims[0].rank:
             max_shape = compute_max_shape(node.out_dims)
-            old_shape, shape, new_transpose = broadcast_reduce(
-                max_shape, node.in_dims[edge.to_idx].shape, transpose)
-            extra_history = [
-                TransposeHistory(
-                    node, node.in_dims[edge.to_idx].shape, transpose, shape, new_transpose)
-            ]
-            old_shape = apply_transpose(old_shape, new_transpose)
-            shape = apply_transpose(shape, transpose)
-            cur_actions.append(InsertReshapeAction(
-                node, direction='in', idx=edge.to_idx,
-                in_shape=Dim.unnamed(old_shape), out_shape=Dim.unnamed(shape)))
+            b_axes = broadcasted_axes(edge_in_shape, max_shape)
+
+            transpose_without_broadcast = strip_axes_from_transpose(
+                reverse_transpose(transpose), b_axes)
+            # from shape will be the old shape with the unbroadcasted transpose
+            from_shape = apply_transpose(
+                edge_in_shape, transpose_without_broadcast)
+            # to shape is the broadcasted input shape under the transpose, with leading ones removed
+            broadcasted_shape = ([1] * len(b_axes)) + list(edge_in_shape)
+            to_shape = strip_leading_ones(apply_transpose(
+                broadcasted_shape, reverse_transpose(transpose)), len(from_shape))
+            # if they are not equal insert a reshape
+            if from_shape != to_shape:
+                info(
+                    f'{node.name} broadcasted input {edge.to_idx} requires reshape {from_shape}->{to_shape}')
+                cur_actions.append(
+                    InsertReshapeAction(
+                        node, direction='in', idx=edge.to_idx,
+                        in_shape=from_shape,
+                        out_shape=to_shape
+                    ))
+                extra_history = [
+                    TransposeHistory(
+                        node, broadcasted_shape, transpose_without_broadcast, edge_in_shape)
+                ]
+            else:
+                extra_history = []
         else:
-            new_transpose = transpose
             extra_history = []
         new_actions, visited_up_nodes = search_up(
             G, edge.from_node, exclude_nodes, visited_nodes | cur_visited_nodes, edge,
@@ -749,17 +857,62 @@ def apply_actions(G, results: Sequence[Action]):
         action.execute(G)
 
 
+def search_edge_down(G, edge, pass_classes, stop_class):
+    node = edge.to_node
+    if isinstance(node, stop_class):
+        return node
+    elif isinstance(node, pass_classes):
+        edges = G.out_edges(node)
+        if len(edges) != 1:
+            return None
+        return search_edge_down(G, edges[0], pass_classes, stop_class)
+    else:
+        return None
+
+
+def find_sequences(G, end_node_class, middle_node_classes):
+    nodes = set(G.nodes(node_classes=end_node_class))
+    pairs = []
+    while nodes:
+        node = nodes.pop()
+        edges = G.out_edges(node)
+        if len(edges) != 1:
+            continue
+        end_node = search_edge_down(
+            G, edges[0], middle_node_classes, end_node_class)
+        if end_node is None:
+            continue
+        pairs.append((node, end_node))
+        if end_node in nodes:
+            nodes.remove(end_node)
+    return pairs
+
+
 def combine_transposes(G):
-    transpose_seqs = [trans for trans in G.nodes(node_classes=TransposeParameters)
-                      if len(G.out_edges(trans.name)) == 1 and
-                      isinstance(G.out_edges(trans.name)[0].to_node, TransposeParameters)]
-    for trans in transpose_seqs:
-        to_trans = G.out_edges(trans.name)[0].to_node
-        new_transpose = apply_transpose(trans.transpose, to_trans.transpose)
+    trans_pairs = find_sequences(
+        G,
+        TransposeParameters,
+        (CopyParameters, UnaryOpParameters, ActivationParameters))
+
+    for tstart, tend in trans_pairs:
+        new_transpose = apply_transpose(tstart.transpose, tend.transpose)
         info(
-            f'combine transposes {trans.name} and {to_trans.name} {trans.transpose} & {to_trans.transpose} -> {new_transpose}')
-        trans.transpose = new_transpose
-        G.remove_and_reconnect(to_trans, edge_class=NNEdge)
+            f'combine transposes {tstart.name} and {tend.name} {tstart.transpose} & {tend.transpose} -> {new_transpose}')
+        tstart.transpose = new_transpose
+        G.remove_and_reconnect(tend, edge_class=NNEdge)
+
+
+def combine_reshapes(G):
+    reshape_pairs = find_sequences(
+        G,
+        ReshapeParameters,
+        (CopyParameters, UnaryOpParameters, ActivationParameters))
+
+    for rstart, rend in reshape_pairs:
+        info(
+            f'combine reshapes {rstart.name} and {rend.name} {rstart.shape} & {rend.shape}')
+        rstart.shape = rend.shape
+        G.remove_and_reconnect(rend, edge_class=NNEdge)
 
 
 def remove_silly_reshapes(G):
@@ -796,19 +949,19 @@ def delete_step_idx(G, action: DeleteTransposeAction):
     return G.in_edges(action.node)[0].from_node.step_idx
 
 
-def eliminate_transposes(G, debug_function=None, one_cycle=False):
+def eliminate_transposes(G, debug_function=None, one_cycle=False, single_step=False, do_silly=True):
     info("eliminating unnecessary transposes")
     found_results = True
     pass_count = 0
     while found_results:
         pass_count += 1
-        if pass_count > 50:
+        if pass_count > (200 if single_step else 50):
             raise ValueError(
                 "Sorry, eliminate transposes is stuck in a loop. Please report to GreenWaves.")
         found_results = False
         visited_nodes = set()
         actions = []
-        info("search for transposes")
+        info(f"search for transposes +++ PASS {pass_count}")
         transposes = G.nodes(node_classes=TransposeParameters)
         while transposes:
             transpose_node = transposes.pop(0)
@@ -830,7 +983,11 @@ def eliminate_transposes(G, debug_function=None, one_cycle=False):
                     visited_nodes,
                     cur_visited_up,
                     in_edge,
-                    [TransposeHistory(transpose_node, transpose_node.in_dims[0], transpose_node.transpose)])
+                    [TransposeHistory(  # When going up transpose is reversed
+                        transpose_node,
+                        transpose_node.out_dims[0].shape,
+                        reverse_transpose(transpose_node.transpose),
+                        transpose_node.in_dims[0].shape)])
                 # cur_visited_up.append(transpose_node)
             except CantContinueError:
                 cur_actions_up = cur_visited_up = None
@@ -853,7 +1010,11 @@ def eliminate_transposes(G, debug_function=None, one_cycle=False):
                         visited_nodes,
                         cur_visited_down,
                         edge,
-                        [TransposeHistory(transpose_node, transpose_node.out_dims[0], transpose_node.transpose)])
+                        [TransposeHistory(
+                            transpose_node,
+                            transpose_node.in_dims[0].shape,
+                            transpose_node.transpose,
+                            transpose_node.out_dims[0].shape)])
                     cur_actions_down += this_actions_down
                     cur_visited_down |= this_visited_down
             except CantContinueError:
@@ -876,6 +1037,8 @@ def eliminate_transposes(G, debug_function=None, one_cycle=False):
                 actions += cur_actions_up
                 visited_nodes |= set(cur_visited_up.nodes)
                 visited_nodes.add(transpose_node)
+                if single_step:
+                    break
             # if transpose cannot be removed upwards movement push the transpose down if it actually moved
             elif down_count > 0 or (down_count == 0 and transpose_moved(G, cur_actions_down)):
                 info(
@@ -884,14 +1047,21 @@ def eliminate_transposes(G, debug_function=None, one_cycle=False):
                 actions += cur_actions_down
                 visited_nodes |= set(cur_visited_down.nodes)
                 visited_nodes.add(transpose_node)
+                if single_step:
+                    break
+            else:
+                info(
+                    f'no elimination for {transpose_node.name} found')
 
         if found_results:
             info("eliminate transposes")
             apply_actions(G, actions)
         else:
             info("no transposes to eliminate found")
-        remove_silly_reshapes(G)
-        combine_transposes(G)
+        if do_silly:
+            remove_silly_reshapes(G)
+            combine_reshapes(G)
+            combine_transposes(G)
         G.add_dimensions()
         if debug_function:
             debug_function(G)
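
Two of the helpers introduced above carry the broadcast bookkeeping for transposes:
strip_axes_from_transpose removes the broadcast axes before a transpose is
propagated to a lower-rank input, and expand_axes_in_transpose re-adds leading
axes on the way back. The two should round-trip; a self-contained check with an
assumed NCHW->NHWC transpose:

    def strip_axes_from_transpose(transpose, stripped_axes):
        # drop the stripped axes, renumbering the survivors downwards
        return tuple(axis - sum(1 for s in stripped_axes if s < axis)
                     for axis in transpose if axis not in stripped_axes)

    def expand_axes_in_transpose(transpose, num_new_axes):
        # prepend identity axes and shift the rest up
        return tuple(list(range(num_new_axes)) +
                     [axis + num_new_axes for axis in transpose])

    t = (0, 2, 3, 1)                                        # NCHW -> NHWC
    assert strip_axes_from_transpose(t, (0,)) == (1, 2, 0)  # CHW -> HWC
    assert expand_axes_in_transpose((1, 2, 0), 1) == t
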
diff --git a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes_actions.py b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes_actions.py
index 22410ee25..55927a224 100644
--- a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes_actions.py
+++ b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes_actions.py
@@ -27,8 +27,12 @@
 from utils.node_id import NodeId
 
 LOG = logging.getLogger("nntool." + __name__)
-LOGL = LOG.info
 
+def info(msg):
+    LOG.info(msg)
+
+def debug(msg):
+    LOG.debug(msg)
 
 class Action(ABC):
     def __init__(self, node) -> None:
@@ -50,7 +54,7 @@ def __init__(self, node, message=None) -> None:
         self.message = message
 
     def _execute(self, node, G):
-        LOGL("%s", str(self))
+        info(f"{self}")
 
 
 class StartAction(DebugActionBase):
@@ -116,7 +120,7 @@ def __init__(self, node, transpose=None, reshape_from=None, reshape_to=None, **k
         self.reshape_to = reshape_to
 
     def _execute(self, node, G):
-        LOGL("%s", str(self))
+        info(f"{self}")
         direction = self.direction
         if self.reshape_from is not None:
             params = ReshapeParameters(G.unique_name(
@@ -150,7 +154,7 @@ def __init__(self, node, in_shape=None, out_shape=None, **kwargs) -> None:
             self.out_shape = out_shape.clone() if out_shape is not None else None
 
     def _execute(self, node, G):
-        LOGL("%s", str(self))
+        info(f"{self}")
         params = ReshapeParameters(G.unique_name(
             f'{node.name}_reshape'), old_shape=self.in_shape, shape=self.out_shape)
         self.do_insert(node, G, params)
@@ -158,15 +162,21 @@ def _execute(self, node, G):
     def __str__(self) -> str:
         return f"insert reshape at {self.node.name}:{self.direction}_{self.idx} in {self.in_shape} out {self.out_shape}"
 
+def make_dim(shape):
+    if shape is None:
+        return shape
+    if isinstance(shape, Dim):
+        return shape.clone()
+    return Dim.unnamed(shape)
 
 class SetReshapeAction(Action):
     def __init__(self, node, in_shape=None, out_shape=None) -> None:
         super(SetReshapeAction, self).__init__(node)
-        self.in_shape = in_shape.clone() if in_shape is not None else None
-        self.out_shape = out_shape.clone() if out_shape is not None else None
+        self.in_shape = make_dim(in_shape)
+        self.out_shape = make_dim(out_shape)
 
     def _execute(self, node, G):
-        LOGL("%s", str(self))
+        info(f"{self}")
         if self.in_shape is not None:
             node.old_shape = self.in_shape
         if self.out_shape is not None:
@@ -178,7 +188,7 @@ def __str__(self) -> str:
 
 class SwitchBatchLinearAction(Action):
     def _execute(self, node, G):
-        LOGL("%s", str(self))
+        info(f"{self}")
         self.node.batch_minor = not self.node.batch_minor
 
     def __str__(self) -> str:
@@ -195,7 +205,7 @@ def __init__(self, node, transpose_in, dir=None, transpose_out=None) -> None:
             self.transpose_out = tuple(transpose_out)
 
     def _execute(self, node, G):
-        LOGL("%s", str(self))
+        info(f"{self}")
         node.act_slice = [node.act_slice[idx] for idx in self.transpose_in]
         node.out_shape = [node.out_shape[idx] for idx in self.transpose_out]
 
@@ -209,7 +219,7 @@ def __init__(self, node, transpose, dir=None) -> None:
         self.transpose = tuple(transpose)
 
     def _execute(self, node, G):
-        LOGL("%s", str(self))
+        info(f"{self}")
         node.padding = [node.padding[idx] for idx in self.transpose]
         node.pad_vals = [node.pad_vals[idx] for idx in self.transpose]
 
@@ -223,7 +233,7 @@ def __init__(self, node, transpose, dir=None) -> None:
         self.transpose = tuple(transpose)
 
     def _execute(self, node, G):
-        LOGL("%s", str(self))
+        info(f"{self}")
         node.axis = self.transpose[node.axis]
 
     def __str__(self) -> str:
@@ -254,7 +264,7 @@ def from_history(cls, node, history, transpose=None):
 
 class ReorderInputDims(TransposeInputBase):
     def _execute(self, node, G):
-        LOGL("%s", str(self))
+        info(f"{self}")
         node.dims.transpose(self.transpose)
         if node.out_dims_hint:
             node.out_dims_hint[0] = [node.out_dims_hint[0][idx]
@@ -267,7 +277,7 @@ def __str__(self) -> str:
 
 class ReorderConstantInput(TransposeInputBase):
     def _execute(self, node, G):
-        LOGL("%s", str(self))
+        info(f"{self}")
         if G.quantization:
             qrec = G.quantization.get(NodeId(node), None)
         else:
@@ -293,7 +303,7 @@ def __str__(self) -> str:
 class DeleteReshapeAction(Action):
 
     def _execute(self, node, G):
-        LOGL("%s", str(self))
+        info(f"{self}")
         if node.name not in G:
             return
         G.remove_and_reconnect(node, edge_class=NNEdge)
@@ -308,7 +318,7 @@ def __init__(self, node, reshape=None) -> None:
         self.reshape = reshape
 
     def _execute(self, node, G):
-        LOGL("%s", str(self))
+        info(f"{self}")
         if node.name not in G:
             return
         if self.reshape:
@@ -342,7 +352,7 @@ def __move_transpose(self, from_dir, to_dir):
         pass
 
     def _execute(self, node, G):
-        LOGL("%s", str(self))
+        info(f"{self}")
         node.transpose = self.transpose
 
     def __str__(self) -> str:
@@ -358,31 +368,28 @@ def __init__(self, node, direction, transpose, shape, qrec=None) -> None:
         self.qrec = qrec
 
     @classmethod
-    def out_from_history(cls, node, history, qrec):
+    def from_history(cls, node, history, qrec, dir):
         # Find the first entry in the transpose history that actually has a transpose
         first_valid_entry = next(iter([rec
                                        for rec in reversed(history)
-                                       if rec.from_transpose]))
-        # in outwards direction the from_shape is the shape before the transpose and we
-        # want to apply the transpose to get to the shape
-        return cls(node, "out", tuple(first_valid_entry.from_transpose),
-                   tuple(first_valid_entry.from_shape), qrec=qrec)
+                                       if rec.transpose]))
+        # arriving from the top, the transpose is in the down direction; arriving
+        # from the bottom, it is in the up direction, so in both cases we reverse it
+        transpose = tuple(reverse_transpose(first_valid_entry.transpose))
+        # shape closest to the node
+        shape = tuple(first_valid_entry.to_shape)
+        return cls(node, dir, transpose, shape, qrec=qrec)
+
+    @classmethod
+    def out_from_history(cls, node, history, qrec):
+        return cls.from_history(node, history, qrec, "out")
 
     @classmethod
     def in_from_history(cls, node, history, qrec):
-        # Find the first entry in the transpose history that actually has a transpose
-        first_valid_entry = next(iter([rec
-                                       for rec in reversed(history)
-                                       if rec.from_transpose]))
-        # in down direction we are pushing the transpose into the FC so
-        # the from_shape is the shape with the transpose applied and
-        # we want to reverse the from_transpose to get back to the shape before it
-        transpose = tuple(reverse_transpose(first_valid_entry.from_transpose))
-        shape = tuple(first_valid_entry.from_shape)
-        return cls(node, "in", transpose, shape, qrec=qrec)
+        return cls.from_history(node, history, qrec, "in")
 
     def _execute(self, node, G):
-        LOGL("%s", str(self))
+        info(f"{self}")
         filter_node = node.contained_filters()[0] if isinstance(node, LinearFusionParameters) else node
         in_edges = G.indexed_in_edges(node.name)
         weights_node = in_edges[1].from_node
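
The refactored `from_history` reverses the recorded transpose whichever side it arrived from. For reference, inverting a permutation used as a transpose is just its argsort; a small self-contained check (a sketch, not the nntool helper itself):

    def reverse_transpose(transpose):
        # inverse permutation q such that q[transpose[i]] == i
        return tuple(sorted(range(len(transpose)), key=lambda i: transpose[i]))

    p = (0, 2, 3, 1)
    q = reverse_transpose(p)
    assert q == (0, 3, 1, 2)
    assert tuple(p[i] for i in q) == (0, 1, 2, 3)  # composing them is the identity
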
diff --git a/tools/nntool/graph/matches/matchers/move_node_up.py b/tools/nntool/graph/matches/matchers/move_node_up.py
index 49c08b4bc..76b1545ec 100644
--- a/tools/nntool/graph/matches/matchers/move_node_up.py
+++ b/tools/nntool/graph/matches/matchers/move_node_up.py
@@ -18,7 +18,8 @@
                          Conv2DParameters, FcParameters, GlobalPoolingParameters,
                          MatrixAddParameters, MatrixMulParameters, NNEdge,
                          PoolingParameters, ReluActivationParameters,
-                         ReshapeParameters, TransposeParameters)
+                         ReshapeParameters, TransposeParameters, MatMulTransposedParameters)
+from graph.types.others import ReverseParameters, StridedSliceParameters
 from graph.types.tensor_arithmetic import MatMulOpParameters
 from utils.graph import GraphView
 from utils.node_id import NodeId
@@ -128,36 +129,22 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
         return has_modified_graph
 
 
-@groups('scaled')
-@match_name("move_activations_scale8")
+@groups('*')
+@match_name("move_activations_up")
 @description("Tries to move activations so they are after layers that they can be fused with."
-             "Should be run before match_gap_ * fusions. Compatible with AutoTiler SQ8 kernels.")
+             "Should be run before match_gap_ * fusions.")
 @needs_valid_dimension(True)
-@run_before('fuse_gap_convs', 'fuse_gap_linear', 'fuse_gap_pool', 'fuse_op_activation_scale8')
+@run_before('fuse_gap_convs', 'fuse_gap_linear', 'fuse_gap_pool', 'fuse_op_activation_scale8', 'fuse_op_activation_pow2')
 class MoveActivationsMatcherScale8(MoveNodeUpMatcher):
 
-    ValidNodesToPass = (ReshapeParameters,
+    ValidNodesToPass = (ReshapeParameters, StridedSliceParameters, ReverseParameters,
                         TransposeParameters, ConcatParameters)
-    ValidFusions = (Conv2DParameters, FcParameters, PoolingParameters, PoolingParameters,
-                    GlobalPoolingParameters, MatrixAddParameters, MatrixMulParameters, MatMulOpParameters)
+    ValidFusions = (Conv2DParameters, FcParameters, PoolingParameters,
+                    GlobalPoolingParameters, MatrixAddParameters, MatrixMulParameters,
+                    MatMulOpParameters, MatMulTransposedParameters)
 
     ValidNodes = (ActivationParameters,)
 
-
-@groups('symmetric')
-@match_name("move_activations_pow2")
-@description("Tries to move activations so they are after layers that they can be fused with."
-             "Should be run before match_gap_ * fusions. Compatible with AutoTiler POW2 kernels.")
-@needs_valid_dimension(True)
-@run_before('fuse_gap_convs', 'fuse_gap_linear', 'fuse_gap_pool', 'fuse_op_activation_pow2')
-class MoveActivationsMatcherPow2(MoveNodeUpMatcher):
-
-    ValidNodesToPass = (ReshapeParameters,
-                        TransposeParameters, ConcatParameters)
-    ValidFusions = (Conv2DParameters, FcParameters, PoolingParameters)
-    ValidNodes = (ActivationParameters,)
-
-
 @groups('scaled')
 @match_name("move_pooling_scale8")
 @description("Tries to move pooling layers so they are after layers that they can be fused with."
diff --git a/tools/nntool/graph/matches/matchers/slice_to_split.py b/tools/nntool/graph/matches/matchers/slice_to_split.py
index e6414e0aa..20f601e88 100644
--- a/tools/nntool/graph/matches/matchers/slice_to_split.py
+++ b/tools/nntool/graph/matches/matchers/slice_to_split.py
@@ -202,7 +202,7 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool:
             diff_slices = [sl for idx, sl in enumerate(
                 slices) if idx in diff_axes]
             axis_lengths = in_edge[0].out_dims[in_edge[1]].shape
-            if min(not_diff_axes) < max(diff_axes):
+            if not_diff_axes and min(not_diff_axes) < max(diff_axes):
                 transpose_from = tuple(range(len(slices)))
                 transpose_to = tuple(diff_axes + not_diff_axes)
                 axis_lengths = [axis_lengths[idx] for idx in transpose_to]
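
The added `not_diff_axes and ...` guard matters because Python's `min()` raises on an empty sequence; when every axis is sliced there is nothing left to reorder:

    not_diff_axes, diff_axes = [], [0, 1]
    # min(not_diff_axes) alone would raise ValueError: min() arg is an empty sequence
    needs_transpose = bool(not_diff_axes) and min(not_diff_axes) < max(diff_axes)
    assert needs_transpose is False
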
diff --git a/tools/nntool/graph/nngraph.py b/tools/nntool/graph/nngraph.py
index eb826daa1..2d4353ef1 100644
--- a/tools/nntool/graph/nngraph.py
+++ b/tools/nntool/graph/nngraph.py
@@ -290,9 +290,9 @@ def nodes_iterator(self, yield_fusions=True):
             else:
                 yield (step_idx, node, None, None)
 
-    def adjust_order(self, reshape_weights=True, postprocess=True, debug_function=None, one_cycle=False):
+    def adjust_order(self, reshape_weights=True, postprocess=True, debug_function=None, one_cycle=False, single_step=False):
         adjust_order(self, reshape_weights=reshape_weights,
-                     postprocess=postprocess, debug_function=debug_function, one_cycle=one_cycle)
+                     postprocess=postprocess, debug_function=debug_function, one_cycle=one_cycle, single_step=single_step)
         LOG.info("adjusted order")
         self.graph_identity.is_adjusted = True
 
@@ -376,3 +376,10 @@ def __repr__(self):
         renderer = TextTableRenderer(150)
         tab.render(renderer)
         return renderer.get_output()
+
+    def total_ops(self):
+        tot_ops = 0
+        for node in self.nodes():
+            ops = node.compute_load()
+            tot_ops += ops if ops else 0
+        return tot_ops
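
What the new `total_ops` helper computes, reduced to its essentials: each node reports a load via `compute_load()`, and `None` (a node that reports no load) counts as zero. The stub class below is for illustration only:

    class _Node:
        def __init__(self, load):
            self._load = load

        def compute_load(self):
            return self._load

    nodes = [_Node(1000), _Node(None), _Node(250)]
    assert sum(n.compute_load() or 0 for n in nodes) == 1250
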
diff --git a/tools/nntool/graph/types/conv2d.py b/tools/nntool/graph/types/conv2d.py
index bb17608d9..de85d7e01 100644
--- a/tools/nntool/graph/types/conv2d.py
+++ b/tools/nntool/graph/types/conv2d.py
@@ -15,7 +15,7 @@
 
 import logging
 
-from ..dim import DilationDim, Dim2D
+from ..dim import DilationDim, Dim, Dim2D
 from .base import (ComparableParameters, FilterLikeParameters, FilterParameters,
                    MultiplicativeBiasParameters, NoSizeChangeParameters,
                    SingleInputAndOutput, cls_op_name, SensitiveToOrder, nargs)
@@ -167,8 +167,11 @@ def get_output_size(self, in_dims):
 
         pad = self.padding.height_width()
 
-        out_dim = ((in_dim - filter_d + pad)//self.stride) + 1
-        out_dim.c = self.filter.out_c
+        h = ((in_dim.h - filter_d.h + pad.h)//self.stride.h) + 1
+        w = ((in_dim.w - filter_d.w + pad.w)//self.stride.w) + 1
+        if h < 0 or w < 0:
+            raise ValueError(f'{self.name}: dimension calculation invalid {h}, {w}')
+        out_dim = Dim.named_ordered(h=h, w=w, c=self.filter.out_c)
         out_dim.impose_order(in_dim.order)
         if self.batch is not None:
             out_dim.insert_axis(0, new_name='n')
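
A worked instance of the per-axis formula now used for the convolution output size, out = (in - filter + pad) // stride + 1 (numbers are illustrative):

    in_h, in_w = 32, 30
    filt_h, filt_w = 3, 3
    pad_h, pad_w = 2, 0          # total padding per axis
    stride_h, stride_w = 2, 2

    h = (in_h - filt_h + pad_h) // stride_h + 1   # (32 - 3 + 2)//2 + 1 = 16
    w = (in_w - filt_w + pad_w) // stride_w + 1   # (30 - 3 + 0)//2 + 1 = 14
    assert (h, w) == (16, 14)
    # a negative h or w means the filter never fits, hence the new ValueError
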
diff --git a/tools/nntool/graph/types/dsp_preprocessing.py b/tools/nntool/graph/types/dsp_preprocessing.py
index 177430e3c..237d743dc 100644
--- a/tools/nntool/graph/types/dsp_preprocessing.py
+++ b/tools/nntool/graph/types/dsp_preprocessing.py
@@ -155,11 +155,11 @@ def gen_fft_twiddles(self):
         else:
             win_lut = None
         fft_twiddles = ConstantInputParameters(self.name + "_FFT_Twiddles", value=gen_fft_twiddles_lut(
-            self.n_cfft, rad4=is_rad4), dims=Dim.unnamed([2, n_fft_lut]))
+            self.n_cfft, rad4=is_rad4), dims=Dim.unnamed([2*n_fft_lut]))
         swaptable = ConstantInputParameters(self.name + "_SwapTable", value=gen_fft_swaptable_lut(
             self.n_cfft, rad4=is_rad4), dims=Dim.unnamed([self.n_cfft]))
         rfft_twiddles = ConstantInputParameters(self.name + "_RFFT_Twiddles", value=gen_rfft_twiddles_lut(
-            self.n_fft), dims=Dim.unnamed([2, self.n_fft//2]))
+            self.n_fft), dims=Dim.unnamed([2*self.n_fft//2]))
         return win_lut, fft_twiddles, swaptable, rfft_twiddles
 
 @cls_op_name("RFFT2D")
@@ -352,8 +352,8 @@ def gen_melfilter(self):
         return melfilt_coeff_sparse_node, melfilt_sparsity_node
 
     def get_melfilter_size(self):
-        melfilt_sparsity, melfilt_coeff = self.gen_melfilter()
-        return melfilt_sparsity.dqvalue.size, melfilt_coeff.dqvalue.size
+        melfilt_coeff, melfilt_sparsity = self.gen_melfilter()
+        return melfilt_coeff.dqvalue.size, melfilt_sparsity.dqvalue.size
 
     def gen_dct_matrix(self):
         norm_factor = np.ones((self.n_dct, self.n_dct))
diff --git a/tools/nntool/graph/types/input_output.py b/tools/nntool/graph/types/input_output.py
index f39356396..9d25014a4 100644
--- a/tools/nntool/graph/types/input_output.py
+++ b/tools/nntool/graph/types/input_output.py
@@ -24,6 +24,7 @@
 
 LOG = logging.getLogger("nntool." + __name__)
 
+
 @not_generated
 class InputOutputParameters(Parameters):
 
@@ -44,7 +45,8 @@ def graph_anon_label(self):
 
     @property
     def graph_label(self):
-        shape = self.out_dims[0] if self.out_dims and self.out_dims[0] else "No Shape!"
+        shape = (self.out_dims[0] if self.out_dims and
+                 self.out_dims[0] is not None else "No Shape!")
         if self.fixed_order:
             return [self.name, f'{shape}', "Frozen Order"]
         return [self.name, f'{shape}']
@@ -129,7 +131,6 @@ def __call__(self, graph):
             raise ValueError('expecting NNGraph as parameter')
         return NNNodeRef(self, 0, graph)
 
-
     def verify(self, G):
         problems = []
         for edge in G.in_edges(self.name):
@@ -153,9 +154,6 @@ def set_input(self, value):
         self.output_value = value
 
 
-
-
-
 @cls_op_name('output')
 class OutputParameters(InputOutputParameters, InsensitiveToQuantization):
 
diff --git a/tools/nntool/graph/types/pooling.py b/tools/nntool/graph/types/pooling.py
index cac79088b..df274d955 100644
--- a/tools/nntool/graph/types/pooling.py
+++ b/tools/nntool/graph/types/pooling.py
@@ -13,11 +13,11 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-from copy import deepcopy
 import logging
+from copy import deepcopy
 
-from ..dim import PoolFilterDim
-from .base import FilterLikeParameters, cls_op_name, SensitiveToOrder
+from ..dim import Dim, PoolFilterDim
+from .base import FilterLikeParameters, SensitiveToOrder, cls_op_name
 
 LOG = logging.getLogger("nntool." + __name__)
 
@@ -70,9 +70,13 @@ def get_output_size(self, in_dims):
             self.padding.calculate_same(in_dims, self.filter, self.stride)
 
         pad = self.padding.height_width()
-        out_dim = ((in_dims - self.filter + pad)//self.stride) + 1
 
-        out_dim.c = in_dims.c
+        h = ((in_dims.h - self.filter.h + pad.h)//self.stride.h) + 1
+        w = ((in_dims.w - self.filter.w + pad.w)//self.stride.w) + 1
+        if h < 0 or w < 0:
+            raise ValueError(f'{self.name}: dimension calculation invalid {h}, {w}')
+
+        out_dim = Dim.named_ordered(c=in_dims.c, h=h, w=w)
         out_dim.impose_order(in_dims.order)
 
         return [out_dim]
diff --git a/tools/nntool/graph/types/tensor_arithmetic.py b/tools/nntool/graph/types/tensor_arithmetic.py
index f21dce53d..f83ac516f 100644
--- a/tools/nntool/graph/types/tensor_arithmetic.py
+++ b/tools/nntool/graph/types/tensor_arithmetic.py
@@ -215,7 +215,11 @@ def get_parameter_size(self):
         return 0
 
     def compute_load(self):
-        return self.out_dims[0].size() * 2
+        line_m1 = self.in_dims[0].shape[-2]
+        col_m1 = self.in_dims[0].shape[-1]
+        col_m2 = self.in_dims[1].shape[-2] if isinstance(self, MatMulTransposedParameters) else self.in_dims[1].shape[-1]
+        n_mat = int(np.prod(self.in_dims[1].shape[:-2]))
+        return n_mat * (line_m1 * col_m1 * col_m2)
 
     def get_output_size(self, in_dims):
         x_shape = list(in_dims[0].shape).copy()
@@ -260,7 +264,7 @@ def get_output_size(self, in_dims):
             remove_last = True
         else:
             remove_last = False
-        y_shape = y_shape[::-1]
+        y_shape = y_shape[:-2] + y_shape[-2:][::-1]
         x_chans = x_shape[:-2:]
         y_chans = y_shape[:-2:]
         out_chans = Dim.npbroadcast([x_chans, y_chans])
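
The rewritten `compute_load` counts one multiply-accumulate per output element per inner-product term, times the number of stacked matrices. For example, for a batched (B, M, K) x (B, K, N) product:

    import numpy as np

    m1_shape = (4, 8, 16)    # B x M x K
    m2_shape = (4, 16, 32)   # B x K x N
    n_mat = int(np.prod(m2_shape[:-2]))
    ops = n_mat * m1_shape[-2] * m1_shape[-1] * m2_shape[-1]
    assert ops == 4 * 8 * 16 * 32   # 16384 MACs
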
diff --git a/tools/nntool/importer/onnx/handlers/backend/conv_mixin.py b/tools/nntool/importer/onnx/handlers/backend/conv_mixin.py
index f77b5d261..ce52ae936 100644
--- a/tools/nntool/importer/onnx/handlers/backend/conv_mixin.py
+++ b/tools/nntool/importer/onnx/handlers/backend/conv_mixin.py
@@ -47,22 +47,42 @@ def conv(cls, node, quantized=False, **kwargs):
         x = inputs[0]
         x_rank = len(x[2].shape)
         x_shape = x[2].shape
-        if x_shape[0] is not None and x_shape[0] > 1:
-            batch = x_shape[0]
-            logger.warning(
-                f"{valid_name} has a non 1 batch dimension of {batch} -"
-                " this is not supported by nntool or autotiler kernels")
+
+        if x_shape[0] is not None:
+            real_in_shape = tuple(x_shape.copy())
+            if x_shape[0] > 1:
+                # support for multi batch is very limited
+                batch = x_shape[0]
+                logger.warning(
+                    f"{valid_name} has a non 1 batch dimension of {batch} -"
+                    " this is not supported by nntool or autotiler kernels")
+            else:
+                # if the batch is specified but is 1 then the input will be reshaped
+                # and the output will have the batch dim set as unknown.
+                batch = None
         else:
+            real_in_shape = tuple(x_shape[1:])
             batch = None
-        real_in_shape = deepcopy(x_shape)
-        #conv_shape = [x if idx > 0 and x is not None else 1 for idx, x in enumerate(x_shape)]
-        conv_shape = x_shape
-        if None in x_shape:
-            real_in_shape.remove(None)
+
         spatial_size = x_rank - 2
         assert spatial_size == 2 or spatial_size == 1, "only 1D and 2D convolutions supported"
+
+        # Input error checking
+        undefined = []
+        if x_shape[1] is None:
+            # cope with swapped batch and channel due to bad initial reshape
+            if x_shape[0] == 1:
+                batch = None
+                x_shape = [x_shape[1], x_shape[0]] + list(x_shape[2:])
+                real_in_shape = x_shape[1:]
+            else:
+                undefined.append(f"input channel size of filter {valid_name} must be defined.")
+
         if not all(dim is not None for dim in x_shape[-spatial_size:]):
-            raise ValueError(f"input spatial size {x_shape} of filter {valid_name} must be defined. You may need to override input dimensions.")
+            undefined.append(f"input spatial size {x_shape} of filter {valid_name} must be defined.")
+        if undefined:
+            raise ValueError(f"{' '.join(undefined)}. You may need to override input dimensions.")
+
         # M x C/group x kH x kW
         weights_idx = 3 if quantized else 1
         weights_node = inputs[weights_idx][0]
@@ -70,15 +90,15 @@ def conv(cls, node, quantized=False, **kwargs):
         weights = cls.get_constant(inputs[weights_idx])
         out_c = weights.shape[0]
         group = node.attrs.get("group", 1)
-        in_c = conv_shape[-spatial_size -
-                          1] if conv_shape[-spatial_size-1] is not None else 1
+        in_c = x_shape[1]
         filt_in_c = in_c // group
         if in_c != weights.shape[1] * group:
             raise ValueError(f'node {valid_name} has incorrect input channel '
                              f'dimension {in_c} expecting {weights.shape[1] * group}')
         if spatial_size == 1:
             filt_w = weights.shape[-1]
-            filt_h = 1
+            filt_h = h = 1
+            w = x_shape[-1]
             # create a new constant node since we are changing the shape
             weights = np.reshape(weights, (out_c, filt_in_c, filt_h, filt_w))
             weights_node = ConstantInputParameters(f'{valid_name}_weights', value=weights,
@@ -88,9 +108,14 @@ def conv(cls, node, quantized=False, **kwargs):
         else:
             filt_h = weights.shape[-2]
             filt_w = weights.shape[-1]
-        h = 1 if spatial_size == 1 else (
-            conv_shape[-2] if conv_shape[-2] is not None else 1)
-        w = conv_shape[-1] if conv_shape[-1] is not None else 1
+            h = x_shape[-2]
+            w = x_shape[-1]
+
+        conv_in_shape = (in_c, h, w)
+
 
         filt_dim = Conv2DFilterDim(filt_h, filt_w,
                                    out_c, in_c=filt_in_c)
@@ -174,48 +199,86 @@ def conv(cls, node, quantized=False, **kwargs):
                           to_node=params, from_idx=0, to_idx=1))
         G.add_edge(NNEdge(from_node=biases_node,
                           to_node=params, from_idx=0, to_idx=2))
-        if conv_shape != real_in_shape:
-            # insert reshape from [xx,None,xx,xx] -> [None, xx, xx, xx]
-            rbatch_params = ReshapeParameters(f'{valid_name}_reshape_batchdim',
-                                              old_shape=Dim.unnamed(
-                                                  conv_shape),
-                                              shape=Dim.unnamed(real_in_shape))
+
+        # check if input needs a reshape
+        if conv_in_shape != real_in_shape:
+            r1_params = ReshapeParameters(f'{valid_name}_reshape_in',
+                                          old_shape=Dim.unnamed(real_in_shape),
+                                          shape=Dim.unnamed(conv_in_shape))
             G.add_edge(
-                NNEdge(from_node=x[0], to_node=rbatch_params, from_idx=x[1], to_idx=0))
-            prev_node = rbatch_params
-            prev_idx = 0
+                NNEdge(from_node=x[0], to_node=r1_params, from_idx=x[1], to_idx=0))
+            G.add_edge(NNEdge(from_node=r1_params,
+                              to_node=params, from_idx=0, to_idx=0))
         else:
-            prev_node = x[0]
-            prev_idx = x[1]
+            G.add_edge(
+                NNEdge(from_node=x[0], to_node=params, from_idx=x[1], to_idx=0))
 
+        # check if output needs a reshape
         if spatial_size == 1:
             if batch is not None:
-                oned_in_shape = [batch, in_c, w]
-                twod_in_shape = [batch, in_c, 1, w]
                 oned_out_shape = [batch, out_dims[0].c, out_dims[0].w]
                 pout_dims = ProvisionalDim(oned_out_shape)
             else:
-                oned_in_shape = [in_c, w]
-                twod_in_shape = [in_c, 1, w]
                 oned_out_shape = [out_dims[0].c, out_dims[0].w]
-                pout_dims = ProvisionalDim([conv_shape[0]] + oned_out_shape)
-            r1_params = ReshapeParameters(f'{valid_name}_reshape2d',
-                                          old_shape=Dim.unnamed(oned_in_shape),
-                                          shape=Dim.unnamed(twod_in_shape))
-            r2_params = ReshapeParameters(f'{valid_name}_reshape1d',
+                pout_dims = ProvisionalDim([None] + oned_out_shape)
+
+            r2_params = ReshapeParameters(f'{valid_name}_reshape_out',
                                           old_shape=out_dims[0],
                                           shape=Dim.unnamed(oned_out_shape))
-            G.add_edge(
-                NNEdge(from_node=prev_node, to_node=r1_params, from_idx=prev_idx, to_idx=0))
-            G.add_edge(NNEdge(from_node=r1_params,
-                              to_node=params, from_idx=0, to_idx=0))
             G.add_edge(NNEdge(from_node=params,
                               to_node=r2_params, from_idx=0, to_idx=0))
-            all_nodes[node.output[0]] = (r2_params, 0, pout_dims, o_qtype)
-            return r2_params
+            params = r2_params
         else:
-            pout_dims = ProvisionalDim([conv_shape[0]] + out_dims[0].shape)
-            G.add_edge(
-                NNEdge(from_node=prev_node, to_node=params, from_idx=prev_idx, to_idx=0))
-            all_nodes[node.output[0]] = (params, 0, pout_dims, o_qtype)
-            return params
+            pout_dims = ProvisionalDim([batch] + out_dims[0].shape)
+
+        all_nodes[node.output[0]] = (params, 0, pout_dims, o_qtype)
+        return params
+
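
In outline, the rewritten wiring reshapes a 1D convolution input into the 2D layout the kernels expect and reshapes the result back; a numpy sketch of just the shape bookkeeping (sizes illustrative):

    import numpy as np

    c, w = 16, 100
    x = np.zeros((c, w))               # 1D conv input, no batch
    x2d = x.reshape(c, 1, w)           # *_reshape_in: kernels see C x H x W
    out_c, out_w = 32, 98
    y2d = np.zeros((out_c, 1, out_w))  # conv output, H stays 1
    y = y2d.reshape(out_c, out_w)      # *_reshape_out: back to C x W
    assert y.shape == (32, 98)
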
diff --git a/tools/nntool/importer/onnx/handlers/backend/gather.py b/tools/nntool/importer/onnx/handlers/backend/gather.py
index 9a822dcfa..1a2761e3d 100644
--- a/tools/nntool/importer/onnx/handlers/backend/gather.py
+++ b/tools/nntool/importer/onnx/handlers/backend/gather.py
@@ -13,6 +13,8 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
+from graph.types.others import StridedSliceParameters
 import numpy as np
 from graph.types import ConstantInputParameters, GatherParameters, NNEdge
 from importer.common.constant_mixin import ConstantMixin
@@ -44,8 +46,14 @@ def _common(cls, node, **kwargs):
             logger.info(f"reducing {valid_name} to a constant {cls.print_small(x_val)}")
             params = ConstantInputParameters(valid_name, value=np.take(x_val, indices, axis=axis))
         else:
-            axis = cls._trim_axis(axis, x_shape)
-            params = GatherParameters(valid_name, axis=axis, indices=indices)
+            if np.size(indices) == 1:
+                idx = int(np.asarray(indices).item())
+                act_slice = tuple([(0, dim, 1) if i != axis else (idx, idx+1, 1) for i, dim in enumerate(x_shape) if dim is not None])
+                out_shape = pshape.known_shape.copy()
+                params = StridedSliceParameters(valid_name, act_slice=act_slice, out_shape=out_shape)
+            else:
+                axis = cls._trim_axis(axis, x_shape)
+                params = GatherParameters(valid_name, axis=axis, indices=indices)
             G.add_edge(NNEdge(from_node=x[0], to_node=params, from_idx=x[1], to_idx=0))
         all_nodes[node.output[0]] = (params, 0, pshape, x[3])
         return params
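
The new branch lowers a Gather with a single index to a strided slice; in numpy terms the two agree up to the kept axis:

    import numpy as np

    x = np.arange(24).reshape(2, 3, 4)
    axis, idx = 1, 2
    gathered = np.take(x, idx, axis=axis)   # shape (2, 4), axis dropped
    sliced = x[:, idx:idx + 1, :]           # StridedSlice keeps the axis as size 1
    assert np.array_equal(gathered, sliced.squeeze(axis))
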
diff --git a/tools/nntool/importer/onnx/handlers/backend/mat_mul_mixin.py b/tools/nntool/importer/onnx/handlers/backend/mat_mul_mixin.py
index b2d6a0c6f..72c18169e 100644
--- a/tools/nntool/importer/onnx/handlers/backend/mat_mul_mixin.py
+++ b/tools/nntool/importer/onnx/handlers/backend/mat_mul_mixin.py
@@ -90,9 +90,13 @@ def _handle(cls, node, quantized=False, **kwargs):
                 )
         else:
             params = MatMulTransposedParameters(valid_name)
-            trans2 = TransposeParameters(f'{valid_name}_tin2', transpose=(1, 0))
+            trans_shape = list(range(len(y_shape)))
+            trans_shape[-2], trans_shape[-1] = trans_shape[-1], trans_shape[-2]
+            trans2 = TransposeParameters(f'{valid_name}_tin2', transpose=tuple(trans_shape))
             out_dims = params.get_output_size(
-                [Dim.unnamed(x_shape), Dim.unnamed(y_shape[::-1])])
+                [Dim.unnamed(x_shape), Dim.unnamed(y_shape[:-2] + y_shape[-2:][::-1])])
             G.add_edge(
                 NNEdge(from_node=y[0], to_node=trans2, from_idx=y[1], to_idx=0))
             G.add_edge(
diff --git a/tools/nntool/importer/onnx/handlers/backend/pad_mixin.py b/tools/nntool/importer/onnx/handlers/backend/pad_mixin.py
index 6a3aef956..77c796f0a 100644
--- a/tools/nntool/importer/onnx/handlers/backend/pad_mixin.py
+++ b/tools/nntool/importer/onnx/handlers/backend/pad_mixin.py
@@ -24,6 +24,7 @@ def pad_start_with(cls, val, pad_val, dlen):
     def calc_pad_dim(cls, node, expected_len):
         if "auto_pad" not in node.attrs or node.attrs["auto_pad"] == "NOTSET":
             pads = cls.pad_start_with(node.attrs.get("pads", []), [0], expected_len)
+            pads = pads if len(pads) < 4 else [pads[0], pads[2], pads[1], pads[3]]
             pad_dim = PadDim(*pads)
         elif node.attrs["auto_pad"] == "VALID":
             pad_dim = PadDim.valid()
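
ONNX stores `pads` as all begins followed by all ends ([h_begin, w_begin, h_end, w_end] in 2D); the added line reorders them into per-axis begin/end pairs, which is the order `PadDim` appears to expect here:

    onnx_pads = [1, 2, 3, 4]     # h_begin, w_begin, h_end, w_end
    pads = [onnx_pads[0], onnx_pads[2], onnx_pads[1], onnx_pads[3]]
    assert pads == [1, 3, 2, 4]  # h_begin, h_end, w_begin, w_end
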
diff --git a/tools/nntool/quantization/float/kernels/matrix_operations.py b/tools/nntool/quantization/float/kernels/matrix_operations.py
index 15789ea37..5ee185f17 100644
--- a/tools/nntool/quantization/float/kernels/matrix_operations.py
+++ b/tools/nntool/quantization/float/kernels/matrix_operations.py
@@ -137,7 +137,8 @@ def execute(cls, params,
         if len(in_tensors) > 2:
             biases = in_tensors[2]
             if len(biases.shape) == 1:
-                biases = np.expand_dims(biases, 1 if mat2.shape[1] == 1 else 0)
+                if biases.shape[0] == mat1.shape[0]:
+                    biases = np.expand_dims(biases, -1)
         else:
             biases = 0
 
@@ -201,6 +202,7 @@ def execute(cls, params,
 
 
 class BinaryOpFloat32(KernelBase):
+    @staticmethod
     def FUNC(x, y): return x
 
     @classmethod
@@ -293,4 +295,5 @@ class SqrtFloat32(UnaryOpFloat32):
 @params_type(RSqrtOpParameters)
 @qrec_type('float')
 class RSqrtFloat32(UnaryOpFloat32):
+    @staticmethod
     def FUNC(x): return 1.0/np.sqrt(x)
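
The bias fix above only expands a 1-D bias to a column when its length matches the output rows, so broadcasting adds it per row rather than silently per column:

    import numpy as np

    mat1, mat2 = np.ones((3, 4)), np.ones((4, 5))
    biases = np.array([1., 2., 3.])           # one bias per output row
    if biases.ndim == 1 and biases.shape[0] == mat1.shape[0]:
        biases = np.expand_dims(biases, -1)   # (3,) -> (3, 1)
    out = mat1 @ mat2 + biases
    assert out.shape == (3, 5) and np.all(out[2] == 7.0)
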
diff --git a/tools/nntool/quantization/float/quantizers/conv_fusion_float.py b/tools/nntool/quantization/float/quantizers/conv_fusion_float.py
index fe714462a..8226ae742 100644
--- a/tools/nntool/quantization/float/quantizers/conv_fusion_float.py
+++ b/tools/nntool/quantization/float/quantizers/conv_fusion_float.py
@@ -14,6 +14,8 @@
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 from copy import deepcopy
+from quantization.float.quantizers.filter_float import AT_HWC_KER_IN_ORDER, AT_HWC_KER_OUT_ORDER, AT_CHW_KER_IN_ORDER, AT_CHW_KER_OUT_ORDER
+from quantization.quantizer_options import HWC_OPTION
 
 import numpy as np
 from bfloat16 import bfloat16
@@ -24,9 +26,11 @@
 from quantization.unified_quantization_handler import (fusion_handler,
                                                        in_qs_constraint,
                                                        out_qs_constraint,
-                                                       params_type)
-
+                                                       params_type, options)
 
+@options(
+    HWC_OPTION
+)
 @params_type(ConvFusionParameters, LinearFusionParameters)
 @in_qs_constraint({'dtype': set([np.float16, np.float32, bfloat16])})
 @out_qs_constraint({'dtype': set([np.float16, np.float32, bfloat16])})
@@ -36,4 +40,9 @@ class FilterFusionFloat(FloatQuantizionHandler):
     def _quantize(cls, params, in_qs, stats, **kwargs):
         _, dtype = cls.get_float_opts(**kwargs)
         out_qs = [deepcopy(in_qs[0])]
+        opts = kwargs['opts']
+        if opts['hwc']:
+            cls.check_order(params, AT_HWC_KER_IN_ORDER, AT_HWC_KER_OUT_ORDER)
+        else:
+            cls.check_order(params, AT_CHW_KER_IN_ORDER, AT_CHW_KER_OUT_ORDER)
         return QRec.float(in_qs=in_qs, out_qs=out_qs, float_dtype=dtype)
diff --git a/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py b/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py
index 3bcf75f22..f28a6cac5 100644
--- a/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py
+++ b/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py
@@ -52,22 +52,11 @@
 def can_ne16(fusion, params, G):
     if not isinstance(params, (Conv2DParameters, FcParameters, MatMulTransposedParameters)):
         return False
-    # if fusion:
-    #     if fusion.fusion_type in ['conv_active_pool', 'conv_active']:
-    #         if any(not isinstance(node, (Conv2DParameters, ReluActivationParameters, PoolingParameters))
-    #                 for node in fusion.contained_nodes()):
-    #             return False
-    #     else:
-    #         return False
     if isinstance(params, Conv2DParameters):
-        # if (params.filter.w != params.filter.h or (params.filter.w != 1 and params.filter.w != 3)):
-        #     return False
         if (params.is_depthwise_conv() and (params.filter.w != 3 or params.filter.h != 3)):
             return False
-        if (params.stride.size() != 1 and params.stride.shape != [2, 2]) and not ((params.filter.w == 1 or params.filter.h == 1)):
-            return False
     elif isinstance(params, MatMulTransposedParameters):
-        in_nodes = [edge.from_node for edge in G.in_edges(params)]
+        in_nodes = [edge.from_node for edge in G.indexed_in_edges(params)]
         if not isinstance(in_nodes[1], ConstantInputParameters):
             return False
     return True
diff --git a/tools/nntool/quantization/symmetric/kernels/dsp_preprocessing.py b/tools/nntool/quantization/symmetric/kernels/dsp_preprocessing.py
index 765c3a543..e6ba670bd 100644
--- a/tools/nntool/quantization/symmetric/kernels/dsp_preprocessing.py
+++ b/tools/nntool/quantization/symmetric/kernels/dsp_preprocessing.py
@@ -130,9 +130,9 @@ def execute(cls, params,
                 qrec: QRec,
                 **kwargs):
         in_data = in_tensors[0]
-        fft_twiddles = in_tensors[2]
+        fft_twiddles = np.stack([in_tensors[2][::2], in_tensors[2][1::2]], axis=0)
         swap_table = in_tensors[3]
-        rfft_twiddles = in_tensors[4]
+        rfft_twiddles = np.stack([in_tensors[4][::2], in_tensors[4][1::2]], axis=0)
 
         spectrograms = []
         for frame_idx in range(params.n_frames):
@@ -164,9 +164,9 @@ def execute(cls, params,
                 in_tensors,
                 qrec: QRec,
                 **kwargs):
-        fft_twiddles = in_tensors[2]
+        fft_twiddles = np.stack([in_tensors[2][::2], in_tensors[2][1::2]], axis=0)
         swap_table = in_tensors[3]
-        rfft_twiddles = in_tensors[4]
+        rfft_twiddles = np.stack([in_tensors[4][::2], in_tensors[4][1::2]], axis=0)
 
         mel_filterbank_sparsity_mat = in_tensors[5]
         mel_filterbank_coeff = in_tensors[6]
diff --git a/tools/nntool/quantization/symmetric/kernels/matrix_operations.py b/tools/nntool/quantization/symmetric/kernels/matrix_operations.py
index 25b3df930..fad901498 100644
--- a/tools/nntool/quantization/symmetric/kernels/matrix_operations.py
+++ b/tools/nntool/quantization/symmetric/kernels/matrix_operations.py
@@ -204,7 +204,8 @@ def execute(cls, params,
         if len(in_tensors) > 2:
             biases = in_tensors[2]
             if len(biases.shape) == 1:
-                biases = np.expand_dims(biases, 1 if mat2.shape[1] == 1 else 0)
+                if biases.shape[0] == mat1.shape[0]:
+                    biases = np.expand_dims(biases, -1)
         else:
             biases = 0
 
@@ -235,7 +236,8 @@ def execute(cls, params,
         if len(in_tensors) > 2:
             biases = in_tensors[2]
             if len(biases.shape) == 1:
-                biases = np.expand_dims(biases, 1 if mat2.shape[1] == 1 else 0)
+                if biases.shape[0] == mat1.shape[0]:
+                    biases = np.expand_dims(biases, -1)
         else:
             biases = 0
 
diff --git a/tools/nntool/utils/gen_twiddles.py b/tools/nntool/utils/gen_twiddles.py
index 786aedf0c..60020a925 100644
--- a/tools/nntool/utils/gen_twiddles.py
+++ b/tools/nntool/utils/gen_twiddles.py
@@ -9,14 +9,23 @@ def gen_fft_twiddles_lut(Nfft, Inverse=False, rad4=False):
         Twiddles_real = np.cos(-Phi)
         Twiddles_imag = np.sin(-Phi)
     if rad4:
-        return np.stack([Twiddles_real[:int(3*Nfft/4)], Twiddles_imag[:int(3*Nfft/4)]], axis=0)
-    return np.stack([Twiddles_real[:Nfft//2], Twiddles_imag[:Nfft//2]], axis=0)
+        twid = np.empty((2 * int(3*Nfft/4), ), dtype=Twiddles_real.dtype)
+        twid[0::2] = Twiddles_real[:int(3*Nfft/4)]
+        twid[1::2] = Twiddles_imag[:int(3*Nfft/4)]
+        return twid
+    twid = np.empty((2 * int(Nfft//2), ), dtype=Twiddles_real.dtype)
+    twid[0::2] = Twiddles_real[:int(Nfft//2)]
+    twid[1::2] = Twiddles_imag[:int(Nfft//2)]
+    return twid
 
 def gen_rfft_twiddles_lut(Nfft):
     Phi = (np.pi * 2 / Nfft) * np.arange(0, Nfft//2)
     Twiddles_real = np.sin(Phi)
     Twiddles_imag = np.cos(Phi)
-    return np.stack([Twiddles_real, Twiddles_imag], axis=0)
+    twid = np.empty((Twiddles_real.size + Twiddles_imag.size, ), dtype=Twiddles_real.dtype)
+    twid[0::2] = Twiddles_real
+    twid[1::2] = Twiddles_imag
+    return twid
 
 def gen_fft_swaptable_lut(Ni, rad4=False):
     if rad4:
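
The twiddle LUTs are now stored interleaved (re, im, re, im, ...) rather than as a 2 x N array, and the kernel changes above recover the planar layout with strided slicing. The round trip in numpy:

    import numpy as np

    re = np.cos(np.linspace(0.0, 1.0, 8))
    im = np.sin(np.linspace(0.0, 1.0, 8))
    twid = np.empty(2 * re.size, dtype=re.dtype)
    twid[0::2], twid[1::2] = re, im                  # interleave, as gen_twiddles now does
    planar = np.stack([twid[0::2], twid[1::2]], 0)   # de-interleave, as the kernels do
    assert np.array_equal(planar, np.stack([re, im], 0))
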
diff --git a/utils/gapy/run.py b/utils/gapy/run.py
index bb68ccf00..e30abc17c 100644
--- a/utils/gapy/run.py
+++ b/utils/gapy/run.py
@@ -125,6 +125,10 @@ def appendArgs(top_parser: argparse.ArgumentParser, parser: argparse.ArgumentPar
                         action = "store_true",
                         help = "Launch gtkwave")
 
+    parser.add_argument("--wsl", dest = "wsl",
+                        action = None,
+                        help = "Launch command in wsl environment")
+
     [args, otherArgs] = top_parser.parse_known_args()
 
     if args.platform is not None:
@@ -164,6 +168,7 @@ def operationFunc(args, config = None, system = None):
         #if args.target is None:
         #    raise InputError('The target must be specified')
         config.set('runner/platform', args.platform)
+        config.set('runner/wsl', args.wsl)
 
         if args.binary is not None:
             config.set('runner/boot-loader', args.binary)
diff --git a/utils/gapy/runner/board/board_runner.py b/utils/gapy/runner/board/board_runner.py
index b833da654..5b9580d86 100644
--- a/utils/gapy/runner/board/board_runner.py
+++ b/utils/gapy/runner/board/board_runner.py
@@ -95,6 +95,15 @@ def exec(self):
         script = self.config.get_str('openocd/script')
         binary = self.config.get_str('runner/boot-loader')
 
+        wsl    = self.config.get_str('runner/wsl')
+        if wsl is None:
+            wsl_bin = binary
+        else:
+            path_header = '\\"//wsl$/' + wsl
+            path_footer = '\\"'
+            wsl_bin = path_header + binary + path_footer
+            script = os.environ.get('OPENOCD_CHIP_TARGET')
+
         with open(binary, 'rb') as file:
             elffile = ELFFile(file)
             entry = elffile.header['e_entry']
@@ -138,9 +147,9 @@ def exec(self):
             else:
                 platform = self.config.get_str('runner/platform')
                 if chip_family == 'vega' or chip_family == 'gap9_v2':
-                    cmd = '%s -d0 -c "gdb_port disabled; telnet_port disabled; tcl_port disabled" -c "script %s; script %s; load_and_start_binary %s 0x%x"' % (openocd, cable, script, binary, entry)
+                    cmd = '%s -d0 -c "gdb_port disabled; telnet_port disabled; tcl_port disabled" -f "%s" -f "%s" -c "load_and_start_binary %s 0x%x"' % (openocd, cable, script, wsl_bin, entry)
                 else:
-                    cmd = "%s -d0 -c 'gdb_port disabled; telnet_port disabled; tcl_port disabled' -f %s -f %s -f tcl/jtag_boot_entry.tcl -c 'gap8_jtag_load_binary_and_start \"%s\" elf 0x%x'" % (openocd, cable, script, binary, entry)
+                    cmd = "%s -d0 -c 'gdb_port disabled; telnet_port disabled; tcl_port disabled' -f %s -f %s -f tcl/jtag_boot_entry.tcl -c 'gap8_jtag_load_binary_and_start \"%s\" elf 0x%x'" % (openocd, cable, script, wsl_bin, entry)
 
             os.chdir(self.config.get_str('gapy/work_dir'))
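
With `--wsl=<distro>` the binary path handed to OpenOCD is rewritten to the Windows UNC view of the WSL filesystem; a sketch of the string the code above builds (distro name and path invented for illustration):

    wsl = 'Ubuntu'                       # value passed via --wsl
    binary = '/home/me/build/test.elf'
    wsl_bin = '\\"//wsl$/' + wsl + binary + '\\"'
    print(wsl_bin)   # \"//wsl$/Ubuntu/home/me/build/test.elf\"
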
 
diff --git a/utils/rules/pulp_rules.mk b/utils/rules/pulp_rules.mk
index b7af23603..10d3fce53 100644
--- a/utils/rules/pulp_rules.mk
+++ b/utils/rules/pulp_rules.mk
@@ -233,8 +233,12 @@ ifdef LFS_ROOT_DIR
 override config_args += --config-opt=flash/content/partitions/lfs/root_dir=$(LFS_ROOT_DIR)
 endif
 
+ifneq ($(wsl),)
+WSL_ENV="--wsl=$(wsl)"
+endif
+
 flash:
-	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --force --binary=$(BIN) $(runner_args)
+	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --force --binary=$(BIN) $(runner_args)
 
 flash_fs:
 	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --binary=$(BIN) $(runner_args)
@@ -246,10 +250,10 @@ run.prepare:
 	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --exec-prepare --binary=$(BIN) $(runner_args)
 
 run.exec:
-	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --exec --binary=$(BIN) $(runner_args)
+	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --exec --binary=$(BIN) $(runner_args) $(WSL_ENV)
 
 run:
-	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --exec-prepare --exec --binary=$(BIN) $(runner_args)
+	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --exec-prepare --exec --binary=$(BIN) $(runner_args) $(WSL_ENV)
 
 
 #$(INSTALL_DIR)/runner/run_gapuino.sh $(BUILDDIR) $(BIN) $(RAW_IMAGE_PLPBRIDGE_FLAGS)  $(PLPBRIDGE_FLAGS) $(PLPBRIDGE_EXTRA_FLAGS)

From bf0b89d1089237e5b94fa8ad4e401e3780cda95a Mon Sep 17 00:00:00 2001
From: yao <yao.zhang@greenwaves-technologies.com>
Date: Mon, 20 Dec 2021 16:08:13 +0100
Subject: [PATCH 2/3] Release 4.9.0

---
 .../pmsis/periph/i2s/wav_out_long/wav_out.c   |   4 +-
 gvsoc/gvsoc/bin/pulp-pc-info                  |   6 +
 gvsoc/gvsoc/engine/include/gv/gvsoc.hpp       |   2 +-
 gvsoc/gvsoc/engine/python/gv/gvsoc.py         |   1 +
 gvsoc/gvsoc/engine/src/trace/trace.cpp        |  26 ++-
 gvsoc/gvsoc/engine/vp/trace_domain_impl.cpp   |   3 +-
 .../models/cpu/iss/vp/include/iss_wrapper.hpp |   1 +
 .../models/cpu/iss/vp/src/iss_wrapper.cpp     |   8 +
 .../models/pulp/udma/hyper/udma_hyper_v3.cpp  |   5 +
 .../models/pulp/udma/i2c/v4/udma_i2c.cpp      |   3 +
 .../models/pulp/udma/mram/udma_mram_v2.cpp    |   3 +
 .../models/pulp/udma/udma_v4_impl.cpp         |   8 +
 libs/gap_lib/testbench/testlib.c              |   7 +-
 .../cluster/cluster_sync/fc_to_cl_delegate.h  |   2 +-
 .../pmsis_api/include/pmsis/drivers/i2s.h     |  27 ++-
 rtos/pulp/pulpos-2/kernel/mem_slab.c          |  72 +++++++
 tools/autotiler_v3/Makefile                   |   2 +-
 tools/autotiler_v3_get/Makefile               |   2 +-
 tools/jenkins/gap_sdk_version.txt             |   2 +-
 .../new_generators/float/pool_float.py        | 189 ++++++++++++++++++
 .../graph/manipulations/adjust_order.py       |  55 ++---
 .../eliminate_transposes.py                   |  21 +-
 tools/nntool/graph/nngraph.py                 |   5 +-
 tools/nntool/graph/types/others.py            |   2 +-
 .../importer/onnx/handlers/backend/gru.py     |   7 +-
 .../importer/onnx/handlers/backend/lstm.py    |   4 +-
 .../onnx/handlers/backend/pad_mixin.py        |  23 ++-
 .../importer/onnx/handlers/backend/rnn.py     |   5 +-
 .../onnx/handlers/backend/rnn_mixin.py        |   8 +-
 .../tflite2/handlers/backend/concatenation.py |  17 +-
 .../handlers/backend/fully_connected.py       |  48 +++--
 tools/nntool/interpreter/commands/adjust.py   |  29 ++-
 .../float/quantizers/pool_float.py            |  52 +++++
 utils/bin/binary-size                         |   1 +
 utils/gap_configs/python/ips/iss/iss.py       |   6 +-
 utils/rules/pmsis_rules.mk                    |   2 +-
 36 files changed, 560 insertions(+), 98 deletions(-)
 create mode 100644 rtos/pulp/pulpos-2/kernel/mem_slab.c
 create mode 100644 tools/nntool/generation/new_generators/float/pool_float.py
 create mode 100644 tools/nntool/quantization/float/quantizers/pool_float.py

diff --git a/examples/pmsis/periph/i2s/wav_out_long/wav_out.c b/examples/pmsis/periph/i2s/wav_out_long/wav_out.c
index cb946ec1f..70b964eca 100644
--- a/examples/pmsis/periph/i2s/wav_out_long/wav_out.c
+++ b/examples/pmsis/periph/i2s/wav_out_long/wav_out.c
@@ -50,7 +50,7 @@ static int fs_write_from_L3(void *file, void *data, int size_total, struct pi_de
         if(rest_size >= INTER_BUFF_SIZE)
             size = INTER_BUFF_SIZE;
         else
-            size = INTER_BUFF_SIZE - rest_size;
+            size = rest_size;
 
         pi_ram_read(ram, (_l3_buff+l3_index), _tmp_buffer, (uint32_t) size);
         pi_fs_write(file, _tmp_buffer, size);
@@ -59,6 +59,8 @@ static int fs_write_from_L3(void *file, void *data, int size_total, struct pi_de
         rest_size = rest_size - size;
     } while (rest_size);
 
+    pmsis_l2_malloc_free(_tmp_buffer, (uint32_t) INTER_BUFF_SIZE);
+
     return 0;
 }
 
diff --git a/gvsoc/gvsoc/bin/pulp-pc-info b/gvsoc/gvsoc/bin/pulp-pc-info
index b86a4b8d8..62bff025e 100755
--- a/gvsoc/gvsoc/bin/pulp-pc-info
+++ b/gvsoc/gvsoc/bin/pulp-pc-info
@@ -3,6 +3,7 @@
 import argparse
 import os
 from subprocess import Popen, PIPE
+import sys
 
 
 parser = argparse.ArgumentParser(description='Generate PC debug info')
@@ -125,6 +126,11 @@ for f in functions:
 
 # And finally generate the output files
 
+if args.allFile is None and args.pcFile is None and args.debugFile is None and args.inlineFile is None:
+	for f in functions:
+		f.dumpAll(sys.stdout)
+
+
 # PC oriented file
 if args.allFile != None:
 	with open(args.allFile, 'w') as file:
diff --git a/gvsoc/gvsoc/engine/include/gv/gvsoc.hpp b/gvsoc/gvsoc/engine/include/gv/gvsoc.hpp
index bd9f07568..a5819c6fa 100644
--- a/gvsoc/gvsoc/engine/include/gv/gvsoc.hpp
+++ b/gvsoc/gvsoc/engine/include/gv/gvsoc.hpp
@@ -200,7 +200,7 @@ namespace gv {
          * @param id ID of the VCD event.
          * @param value The new value.
          */
-        virtual void event_update_logical(int64_t timestamp, int id, uint8_t value) = 0;
+        virtual void event_update_logical(int64_t timestamp, int id, uint64_t value) = 0;
 
         /**
          * Called by GVSOC to update the value of a bitfield VCD event.
diff --git a/gvsoc/gvsoc/engine/python/gv/gvsoc.py b/gvsoc/gvsoc/engine/python/gv/gvsoc.py
index 329df9389..04f900465 100644
--- a/gvsoc/gvsoc/engine/python/gv/gvsoc.py
+++ b/gvsoc/gvsoc/engine/python/gv/gvsoc.py
@@ -120,6 +120,7 @@ def gen_config(args, config):
 
         for binary in debug_binaries:
             full_config.set('**/debug_binaries', binary + '.debugInfo')
+            full_config.set('**/binaries', binary)
 
 
     gvsoc_config_path = os.path.join(config.get_str('gapy/work_dir'), 'gvsoc_config.json')
diff --git a/gvsoc/gvsoc/engine/src/trace/trace.cpp b/gvsoc/gvsoc/engine/src/trace/trace.cpp
index 3ef9c4602..26605db01 100644
--- a/gvsoc/gvsoc/engine/src/trace/trace.cpp
+++ b/gvsoc/gvsoc/engine/src/trace/trace.cpp
@@ -83,6 +83,8 @@ void vp::component_trace::new_trace_event_string(std::string name, trace *trace)
     trace->name = name;
     trace->path = top.get_path() + "/" + name;
 
+    trace->width = 0;
+    trace->is_real = false;
     trace->is_string = true;
     trace->pending_timestamp = -1;
     trace->bytes = 0;
@@ -409,15 +411,33 @@ void vp::trace_engine::flush_event_traces(int64_t timestamp)
             }
             else if (current->is_string)
             {
-                //this->vcd_user->event_update_logical(int id, uint8_t *value, uint8_t *flags);
+                this->vcd_user->event_update_string(timestamp, current->id, (char *)current->buffer);
             }
-            else if (current->width > 1)
+            else if (current->width > 8)
             {
+                if (current->width <= 16)
+                {
+                    this->vcd_user->event_update_logical(timestamp, current->id, *(uint16_t *)current->buffer);
+                }
+                else if (current->width <= 32)
+                {
+                    this->vcd_user->event_update_logical(timestamp, current->id, *(uint32_t *)current->buffer);
+                }
+                else if (current->width <= 64)
+                {
+                    this->vcd_user->event_update_logical(timestamp, current->id, *(uint64_t *)current->buffer);
+                }
+                else
+                {
+                    // Use bitfield
+                }
 
             }
             else
             {
-                this->vcd_user->event_update_logical(timestamp, current->id, *current->buffer);
+                uint64_t value = (uint64_t)*(current->buffer);
+
+                this->vcd_user->event_update_logical(timestamp, current->id, value);
             }
         }
         else
diff --git a/gvsoc/gvsoc/engine/vp/trace_domain_impl.cpp b/gvsoc/gvsoc/engine/vp/trace_domain_impl.cpp
index 182e9a408..76f4757b3 100644
--- a/gvsoc/gvsoc/engine/vp/trace_domain_impl.cpp
+++ b/gvsoc/gvsoc/engine/vp/trace_domain_impl.cpp
@@ -178,7 +178,7 @@ void trace_domain::check_trace_active(vp::trace *trace, int event)
     {
         for (auto &x : events_path_regex)
         {
-            if (x.second->is_path || regexec(x.second->regex, full_path.c_str(), 0, NULL, 0) == 0)
+            if ((x.second->is_path && x.second->path == full_path) || regexec(x.second->regex, full_path.c_str(), 0, NULL, 0) == 0)
             {
                 std::string file_path = x.second->file_path;                
                 vp::Event_trace *event_trace;
@@ -462,6 +462,7 @@ void trace_domain::conf_trace(int event, std::string path_str, bool enabled)
     const char *file_path = "all.vcd";
     const char *path = path_str.c_str();
     char *delim = (char *)::index(path, '@');
+
     if (delim)
     {
         *delim = 0;
diff --git a/gvsoc/gvsoc/models/cpu/iss/vp/include/iss_wrapper.hpp b/gvsoc/gvsoc/models/cpu/iss/vp/include/iss_wrapper.hpp
index ac739ef20..f7e234e1c 100644
--- a/gvsoc/gvsoc/models/cpu/iss/vp/include/iss_wrapper.hpp
+++ b/gvsoc/gvsoc/models/cpu/iss/vp/include/iss_wrapper.hpp
@@ -151,6 +151,7 @@ class iss_wrapper : public vp::component, vp::Gdbserver_core
   vp::trace     inline_trace_event;
   vp::trace     line_trace_event;
   vp::trace     file_trace_event;
+  vp::trace     binaries_trace_event;
   vp::trace     pcer_trace_event[32];
   vp::trace     insn_trace_event;
 
diff --git a/gvsoc/gvsoc/models/cpu/iss/vp/src/iss_wrapper.cpp b/gvsoc/gvsoc/models/cpu/iss/vp/src/iss_wrapper.cpp
index 4559223c9..69df9a4e0 100644
--- a/gvsoc/gvsoc/models/cpu/iss/vp/src/iss_wrapper.cpp
+++ b/gvsoc/gvsoc/models/cpu/iss/vp/src/iss_wrapper.cpp
@@ -1300,6 +1300,7 @@ int iss_wrapper::build()
   traces.new_trace_event_string("func", &func_trace_event);
   traces.new_trace_event_string("inline_func", &inline_trace_event);
   traces.new_trace_event_string("file", &file_trace_event);
+  traces.new_trace_event_string("binaries", &binaries_trace_event);
   traces.new_trace_event("line", &line_trace_event, 32);
 
   traces.new_trace_event_real("ipc_stat", &ipc_stat_event);
@@ -1423,6 +1424,13 @@ void iss_wrapper::start()
     iss_register_debug_info(this, x->get_str().c_str());
   }
 
+  if (this->get_js_config()->get("**/binaries") != NULL)
+  {
+    for (auto x:this->get_js_config()->get("**/binaries")->get_elems())
+    {
+      this->binaries_trace_event.event_string("static enable " + x->get_str());
+    }
+  }
 
   trace.msg("ISS start (fetch: %d, is_active: %d, boot_addr: 0x%lx)\n", fetch_enable_reg.get(), is_active_reg.get(), get_config_int("boot_addr"));
 
diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/hyper/udma_hyper_v3.cpp b/gvsoc/gvsoc_gap/models/pulp/udma/hyper/udma_hyper_v3.cpp
index 135ddc6cb..7de6736f5 100644
--- a/gvsoc/gvsoc_gap/models/pulp/udma/hyper/udma_hyper_v3.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/udma/hyper/udma_hyper_v3.cpp
@@ -83,6 +83,8 @@ Hyper_periph::Hyper_periph(udma *top, int id, int itf_id) : Udma_periph(top, id)
     this->read_req_waiting = new Udma_queue<Hyper_read_request>(-1);
     this->read_req_ready = new Udma_queue<Hyper_read_request>(-1);
 
+    memset((void *)&this->ca, 0, sizeof(this->ca));
+
 }
 
 
@@ -191,6 +193,9 @@ void Hyper_periph::reset(bool active)
 {
     Udma_periph::reset(active);
 
+    this->rx_channel->reset(active);
+    this->tx_channel->reset(active);
+
     if (!active)
     {
         this->pending_tx = false;
diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/i2c/v4/udma_i2c.cpp b/gvsoc/gvsoc_gap/models/pulp/udma/i2c/v4/udma_i2c.cpp
index a1a71b451..8a589314f 100644
--- a/gvsoc/gvsoc_gap/models/pulp/udma/i2c/v4/udma_i2c.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/udma/i2c/v4/udma_i2c.cpp
@@ -141,6 +141,9 @@ void I2c_periph::reset(bool active)
 {
     Udma_periph::reset(active);
 
+    this->foll_rx_channel->reset(active);
+    this->foll_tx_channel->reset(active);
+
     if (active)
     {
     }
diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/mram/udma_mram_v2.cpp b/gvsoc/gvsoc_gap/models/pulp/udma/mram/udma_mram_v2.cpp
index d8d98a74c..0a944fd4e 100644
--- a/gvsoc/gvsoc_gap/models/pulp/udma/mram/udma_mram_v2.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/udma/mram/udma_mram_v2.cpp
@@ -508,6 +508,9 @@ void Mram_periph::reset(bool active)
 
     this->read_fifo->reset(active);
 
+    this->rx_channel->reset(active);
+    this->tx_channel->reset(active);
+
     if (!active)
     {
         this->pending_transfer_size = 0;
diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_impl.cpp b/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_impl.cpp
index e58dad15b..127890b20 100644
--- a/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_impl.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_impl.cpp
@@ -709,6 +709,14 @@ void udma::start()
 
 void udma::reset(bool active)
 {
+    for (auto x: this->periphs)
+    {
+        if (x)
+        {
+            x->reset(active);
+        }
+    }
+
     for (auto x: this->addrgen_linear)
     {
         x->reset(active);
diff --git a/libs/gap_lib/testbench/testlib.c b/libs/gap_lib/testbench/testlib.c
index 1619bdc2a..063221819 100644
--- a/libs/gap_lib/testbench/testlib.c
+++ b/libs/gap_lib/testbench/testlib.c
@@ -185,6 +185,7 @@ void i2s_slot_deinit(i2s_slot_test_t *i2s_slot)
         struct pi_i2s_channel_conf i2s_conf;
         pi_i2s_channel_conf_init(&i2s_conf);
         i2s_conf.options = PI_I2S_OPT_DISABLED | (i2s_slot->is_rx ? PI_I2S_OPT_IS_RX: PI_I2S_OPT_IS_TX);
+
         if (i2s_slot->frame)
         {
             pi_i2s_frame_channel_conf_set(i2s_slot->i2s, i2s_slot->frame, i2s_slot->slot, &i2s_conf);
@@ -237,11 +238,11 @@ int i2s_slot_init(i2s_test_t *test, i2s_slot_test_t *i2s_slot, struct pi_device
 
         if (slot_config->is_rx)
         {
-            i2s_conf.options = PI_I2S_OPT_PINGPONG | PI_I2S_OPT_IS_RX | PI_I2S_OPT_ENABLED;
+            i2s_conf.options = PI_I2S_OPT_IS_RX | PI_I2S_OPT_ENABLED;
         }
         else
         {
-            i2s_conf.options = PI_I2S_OPT_PINGPONG | PI_I2S_OPT_IS_TX | PI_I2S_OPT_ENABLED;
+            i2s_conf.options = PI_I2S_OPT_IS_TX | PI_I2S_OPT_ENABLED;
         }
 
         if (slot_config->slab)
@@ -315,10 +316,12 @@ int i2s_slot_init(i2s_test_t *test, i2s_slot_test_t *i2s_slot, struct pi_device
 
         if (slot_config->slab)
         {
+            i2s_conf.options |= PI_I2S_OPT_MEM_SLAB;
             i2s_conf.mem_slab = &i2s_slot->slab;
         }
         else
         {
+            i2s_conf.options |= PI_I2S_OPT_PINGPONG;
             i2s_conf.pingpong_buffers[0] = i2s_slot->buffers[0];
             i2s_conf.pingpong_buffers[1] = i2s_slot->buffers[1];
         }
diff --git a/rtos/pmsis/pmsis_api/include/pmsis/cluster/cluster_sync/fc_to_cl_delegate.h b/rtos/pmsis/pmsis_api/include/pmsis/cluster/cluster_sync/fc_to_cl_delegate.h
index 04a238380..a4bf4a5ab 100644
--- a/rtos/pmsis/pmsis_api/include/pmsis/cluster/cluster_sync/fc_to_cl_delegate.h
+++ b/rtos/pmsis/pmsis_api/include/pmsis/cluster/cluster_sync/fc_to_cl_delegate.h
@@ -207,7 +207,7 @@ static inline int pi_cluster_send_task_async(struct pi_device *device,
 
 int pi_cluster_send_task_to_cl(struct pi_device *device, struct pi_cluster_task *task);
 int pi_cluster_send_task_to_cl_async(struct pi_device *device, struct pi_cluster_task *cluster_task, pi_task_t *task);
-int pi_cluster_send_tasklet_to_cl_async(struct pi_device *device, struct pi_cluster_task *cluster_task, pi_task_t *task);
+void pi_cluster_send_tasklet_to_cl(struct pi_device *device, struct pi_cluster_task *cluster_task);
 
 static inline int pi_cluster_send_task(struct pi_device *device, struct pi_cluster_task *task)
 {
diff --git a/rtos/pmsis/pmsis_api/include/pmsis/drivers/i2s.h b/rtos/pmsis/pmsis_api/include/pmsis/drivers/i2s.h
index 62c560de7..2a170bcc5 100644
--- a/rtos/pmsis/pmsis_api/include/pmsis/drivers/i2s.h
+++ b/rtos/pmsis/pmsis_api/include/pmsis/drivers/i2s.h
@@ -524,25 +524,38 @@ struct pi_i2s_conf
  * \struct pi_i2s_channel_conf
  *
  * \brief Interface channel configuration options. This configuration has to be
- * used when configuring a channel in TDM mode.
+ * used when configuring a channel in TDM mode. This can also be used to configure
+ * channels when they are part of a frame; be aware that some fields have
+ * specific restrictions in that case.
  */
 struct pi_i2s_channel_conf
 {
     size_t block_size;          /*!< Size of one RX/TX memory block(buffer) in bytes.
                                   On some chips, this size may have to be set under a
-                                  maximum size, check the chip-specific section. */
-    pi_mem_slab_t *mem_slab;    /*!< memory slab to store RX/TX data. */
+                                  maximum size, check the chip-specific section.
+                                  In frame-based mode, this field should be the same for
+                                  all channels. */
+    pi_mem_slab_t *mem_slab;    /*!< memory slab to store RX/TX data.
+                                  In frame-based mode, this field should be the same for
+                                  all channels. */
     void *pingpong_buffers[2];  /*!< Pair of buffers used in double-buffering
-                                  mode to capture the incoming samples.  */
+                                  mode to capture the incoming samples.
+                                  In frame-based mode, this field should be the same for
+                                  all channels.  */
     pi_i2s_fmt_t format;        /*!< Data stream format as defined by PI_I2S_FMT_* constants. */
     uint8_t word_size;          /*!< Number of bits representing one data word. */
     int8_t mem_word_size;       /*!< Number of bits representing one data word in memory.
-                                  If it is -1, this is equal to word_size. */
+                                  If it is -1, this is equal to word_size.
+                                  In frame-based mode, this field should be the same for
+                                  all channels. */
     pi_i2s_opt_t options;       /*!< Configuration options as defined by PI_I2S_OPT_* constants. */
     int8_t asrc_channel;        /*!< If different from -1, this redirects the specified
                                   stream (can be input or output) to/from the ASRC block with
-                                  the channel specified here. */
-    uint8_t ts_evt_id;           /*!< UDMA Config Event ID for generating the timestamp */
+                                  the channel specified here.
+                                  In frame-based mode, this field should be -1. */
+    uint8_t ts_evt_id;           /*!< UDMA Config Event ID for generating the timestamp.
+                                  In frame-based mode, this field should be the same for
+                                  all channels.  */
     uint8_t slot_enable;        /*!< Specifies if the corresponding slot must be enabled or not.
         It is by default set to 1. */
 };
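The restrictions documented above boil down to this: fields that map to resources shared by the whole frame (block_size, mem_slab or pingpong_buffers, mem_word_size, ts_evt_id) must carry the same value for every channel of the frame, and asrc_channel must be -1. A hedged sketch of a frame-slot configuration following these rules, assuming the frame and slot identifiers come from the surrounding application (the helper itself is not part of the API change):

```c
#include "pmsis.h"

/* Sketch: configure one RX slot of a frame in slab mode. block_size and
 * the slab must be identical for every channel of the frame, and
 * asrc_channel must stay at -1 in frame-based mode. */
static void configure_frame_slot(struct pi_device *i2s, int frame, int slot,
                                 pi_mem_slab_t *slab, size_t block_size)
{
    struct pi_i2s_channel_conf conf;
    pi_i2s_channel_conf_init(&conf);

    conf.options      = PI_I2S_OPT_IS_RX | PI_I2S_OPT_ENABLED | PI_I2S_OPT_MEM_SLAB;
    conf.block_size   = block_size;  /* same value for every channel of the frame */
    conf.mem_slab     = slab;        /* same slab for every channel of the frame */
    conf.asrc_channel = -1;          /* required in frame-based mode */

    pi_i2s_frame_channel_conf_set(i2s, frame, slot, &conf);
}
```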
diff --git a/rtos/pulp/pulpos-2/kernel/mem_slab.c b/rtos/pulp/pulpos-2/kernel/mem_slab.c
new file mode 100644
index 000000000..e9457b4b2
--- /dev/null
+++ b/rtos/pulp/pulpos-2/kernel/mem_slab.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016 Wind River Systems, Inc.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "pmsis.h"
+
+static void create_free_list(struct pi_mem_slab *slab)
+{
+    uint32_t j;
+    char *p;
+
+    slab->free_list = NULL;
+    p = slab->buffer + slab->block_size*(slab->num_blocks - 1);
+
+    for (j = 0U; j < slab->num_blocks; j++) {
+        *(char **)p = slab->free_list;
+        slab->free_list = p;
+        p -= slab->block_size;
+    }
+}
+
+
+void pi_mem_slab_init(pi_mem_slab_t *slab, void *buffer,
+            size_t block_size, uint32_t num_blocks)
+{
+    slab->num_blocks = num_blocks;
+    slab->block_size = block_size;
+    slab->buffer = buffer;
+    slab->num_used = 0U;
+    create_free_list(slab);
+}
+
+
+int pi_mem_slab_alloc(pi_mem_slab_t *slab, void **mem, int32_t timeout)
+{
+    int result;
+
+    int irq = hal_irq_disable();
+
+    if (slab->free_list != NULL)
+    {
+        /* take a free block */
+        *mem = slab->free_list;
+        slab->free_list = *(char **)(slab->free_list);
+        slab->num_used++;
+        result = 0;
+    }
+    else
+    {
+        /* don't wait for a free block to become available */
+        *mem = NULL;
+        result = -1;
+    }
+
+    hal_irq_restore(irq);
+    
+    return result;
+}
+
+
+void pi_mem_slab_free(pi_mem_slab_t *slab, void **mem)
+{
+    int irq = hal_irq_disable();
+
+    **(char ***)mem = slab->free_list;
+    slab->free_list = *(char **)mem;
+    slab->num_used--;
+
+    hal_irq_restore(irq);
+}
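pi_mem_slab carves a caller-provided buffer into num_blocks fixed-size blocks linked into a free list; pi_mem_slab_alloc pops a block (returning -1 immediately when none is free, since the timeout argument is not honored) and pi_mem_slab_free pushes it back. A usage sketch, with sizes chosen arbitrarily:

```c
#include "pmsis.h"

/* Sketch: carve BLOCK_COUNT fixed-size blocks out of a static buffer,
 * then allocate and free one block. Block size must be at least the size
 * of a pointer, since free blocks store the free-list link in-place. */
#define BLOCK_SIZE  256
#define BLOCK_COUNT 4

static char slab_buffer[BLOCK_SIZE * BLOCK_COUNT];
static pi_mem_slab_t slab;

static int slab_demo(void)
{
    void *block;

    pi_mem_slab_init(&slab, slab_buffer, BLOCK_SIZE, BLOCK_COUNT);

    if (pi_mem_slab_alloc(&slab, &block, 0))
        return -1;  /* no free block available */

    /* ... fill the block, e.g. hand it to an I2S channel ... */

    pi_mem_slab_free(&slab, &block);
    return 0;
}
```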
diff --git a/tools/autotiler_v3/Makefile b/tools/autotiler_v3/Makefile
index c6f308357..dc12e7fd7 100644
--- a/tools/autotiler_v3/Makefile
+++ b/tools/autotiler_v3/Makefile
@@ -1,4 +1,4 @@
-TILER_VER=4.1.0
+TILER_VER=4.3.0
 export TILER_LIB=libtile.${TILER_VER}.a
 ifdef GAP_SDK_HOME
 export TILER_URL=$(GAP_SDK_HOME)/.tiler_url
diff --git a/tools/autotiler_v3_get/Makefile b/tools/autotiler_v3_get/Makefile
index c6f308357..dc12e7fd7 100644
--- a/tools/autotiler_v3_get/Makefile
+++ b/tools/autotiler_v3_get/Makefile
@@ -1,4 +1,4 @@
-TILER_VER=4.1.0
+TILER_VER=4.3.0
 export TILER_LIB=libtile.${TILER_VER}.a
 ifdef GAP_SDK_HOME
 export TILER_URL=$(GAP_SDK_HOME)/.tiler_url
diff --git a/tools/jenkins/gap_sdk_version.txt b/tools/jenkins/gap_sdk_version.txt
index 1f8776c61..dbbe1c752 100644
--- a/tools/jenkins/gap_sdk_version.txt
+++ b/tools/jenkins/gap_sdk_version.txt
@@ -1 +1 @@
-0addfcd309a48d5a5dbd3e058de0cf1ffc660a2b
+1f3f3d29ae92199246b03f00e4c207540022b502
diff --git a/tools/nntool/generation/new_generators/float/pool_float.py b/tools/nntool/generation/new_generators/float/pool_float.py
new file mode 100644
index 000000000..40aac877d
--- /dev/null
+++ b/tools/nntool/generation/new_generators/float/pool_float.py
@@ -0,0 +1,189 @@
+# Copyright (C) 2021  GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from functools import reduce
+from utils.largest_factor import balanced_divisors
+from quantization.multiplicative.mulbias import compute_in_out_scale
+from generation.new_generators.helpers.in_out_bindings_mixin import InOutBindingsMixin
+from graph.types.global_pooling import GlobalAveragePoolParameters, GlobalSumPoolParameters
+from graph.dim import PadDim
+from generation.new_generators.helpers.act_infos import gen_act_infos
+from generation.at_types.at_params import NO_POOL, gen_activation_op, gen_globalpool_at_params, gen_pool_at_params
+import logging
+from utils.node_id import NodeId
+
+import numpy as np
+from generation.at_types.constant_info import ConstantInfo
+from generation.at_types.gen_ctrl import GenCtrl
+from generation.at_types.tc_arg_info import GlobalArgInfo
+from generation.generators.globals.global_names import INFOS
+from generation.generators.kernels.autotiler_kernel import NewAutoTilerKernel
+from generation.helpers.gen_constant import gen_constant
+from generation.new_generators.generator_base import (GeneratorBase,
+                                                      paramstype, ktype)
+from graph.types import GlobalPoolingParameters, PoolingParameters, ActivationFusion
+from quantization.qtype import QType
+
+LOG = logging.getLogger("nntool." + __name__)
+
+@paramstype(ActivationFusion, GlobalPoolingParameters, PoolingParameters)
+@ktype('float')
+class PoolActGenerator(GeneratorBase, InOutBindingsMixin):
+
+    @classmethod
+    def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
+        return True
+
+    @classmethod
+    def bindings_generator(cls, gen, node, qrec, in_eparams, out_eparams, cname) -> bool:
+        if isinstance(node, ActivationFusion):
+            cnodes = node.contained_nodes()
+            if isinstance(cnodes[0], (GlobalPoolingParameters, PoolingParameters)):
+                cls.set_in_out_bindings(gen, in_eparams, out_eparams, cname, cnodes[0], qrec)
+                return True
+            return False
+        elif isinstance(node, (GlobalPoolingParameters, PoolingParameters)):
+            cls.set_in_out_bindings(gen, in_eparams, out_eparams, cname, node, qrec)
+        else:
+            return False
+        return True
+
+    @classmethod
+    def kernel_generator(cls, gen, node, qrec, in_eparams, out_eparams, cname) -> bool:
+        if isinstance(node, (GlobalPoolingParameters, PoolingParameters)):
+            pool_kernel = PoolActKernel if isinstance(node, PoolingParameters) else GlobalPoolActKernel
+            gen.kernels.append(pool_kernel(node.name, cname, node, qrec, None, None,
+                                           force_relu=gen.force_relu, gen_ctrl=node.get_gen_ctrl()))
+        elif isinstance(node, ActivationFusion):
+            cnodes = node.contained_nodes()
+            quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes]
+            pool_kernel = PoolActKernel if isinstance(cnodes[0], PoolingParameters) else GlobalPoolActKernel
+            gen.kernels.append(pool_kernel(node.name, cname, cnodes[0], quants[0], cnodes[1], quants[1],
+                                           force_relu=gen.force_relu, gen_ctrl=node.get_gen_ctrl()))
+        return True
+
+class PoolActKernel(NewAutoTilerKernel):
+    CALL_TEMPLATE = """// generator for {node_name}
+CNN_PoolAct_fp16("{cname}", {gen_ctrl},
+                 {feat}, {feat}, {width}, {height},
+                 {kop_pool}, {fpx}, {fpy}, {dpx}, {dpy}, {spx}, {spy}, {pool_pad},
+                 {kop_act});
+"""
+    def __init__(self, node_name, cname, pool_params, pool_q, act_params, act_q, force_relu, gen_ctrl=None):
+        if gen_ctrl is None:
+            self.gen_ctrl = gen_ctrl = GenCtrl(None, cname=cname)
+        else:
+            gen_ctrl.cname = cname
+            self.gen_ctrl = gen_ctrl
+        if pool_params.ker_in_order and pool_params.ker_in_order[0] == ["h", "w", "c"]:
+            hwc = True
+            gen_ctrl.hwc = 1
+
+        pad_compatibilities = []
+        at_pool_params = gen_pool_at_params(
+            pool_params, pad_compatibilities)
+        in_dim = pool_params.in_dims[0]
+        out_dim = pool_params.out_dims[0]
+        in_q = pool_q.in_qs[0]
+        out_q = pool_q.out_qs[0]
+
+        if act_params is not None:
+            act_op = gen_activation_op(
+                act_params.activation, force_relu=force_relu, asymmetric=act_q.in_qs[0].zero_point != 0)
+            if out_dim is None:
+                out_dim = act_params.out_dims[0].expand_to_chw()
+            out_q = act_q.out_qs[0]
+        else:
+            act_op = "KOP_NONE"
+
+        if pad_compatibilities:
+            reduction = PadDim.pad_compatibility_reduce(*pad_compatibilities,
+                                                        "convolution padding is not compatible with pool padding")
+            if not reduction[2]:  # default is balanced pad left
+                at_pad_ctrl = next(i for i, v in enumerate(reduction) if v)
+                LOG.debug("%s: generating pad control block", node_name)
+                self.gen_ctrl.PadType = at_pad_ctrl
+
+        attrs = {
+            'in_size': in_q.dtype_bits//8 if in_q.signed else -in_q.dtype_bits//8,
+            'out_size': out_q.dtype_bits//8 if out_q.signed else -out_q.dtype_bits//8,
+            'feat': in_dim.c,
+            'width': in_dim.w,
+            'height': in_dim.h,
+            'kop_pool': at_pool_params.PoolOper,
+            'fpx': at_pool_params.Fpx,
+            'fpy': at_pool_params.Fpy,
+            'dpx': at_pool_params.Dpx,
+            'dpy': at_pool_params.Dpy,
+            'spx': at_pool_params.Spx,
+            'spy': at_pool_params.Spy,
+            'pool_pad': at_pool_params.PoolPad,
+            'kop_act': act_op
+        }
+
+        extra_attrs = {
+            'cname': cname,
+            'node_name': node_name
+        }
+        super().__init__(attrs, extra_attrs, gen_ctrl=gen_ctrl)
+
+class GlobalPoolActKernel(NewAutoTilerKernel):
+    CALL_TEMPLATE = """// generator for {node_name}
+CNN_GlobalPoolAct_fp16("{cname}", {gen_ctrl},
+                       {feat}, {feat}, {width}, {height},
+                       {kop_pool}, {kop_act});
+"""
+    def __init__(self, node_name, cname, pool_params, pool_q, act_params, act_q, force_relu, gen_ctrl=None):
+        if gen_ctrl is None:
+            self.gen_ctrl = gen_ctrl = GenCtrl(None, cname=cname)
+        else:
+            gen_ctrl.cname = cname
+            self.gen_ctrl = gen_ctrl
+
+        at_pool_params = gen_globalpool_at_params(pool_params)
+        in_dim = pool_params.in_dims[0]
+        out_dim = pool_params.out_dims[0]
+        in_q = pool_q.in_qs[0]
+        out_q = pool_q.out_qs[0]
+        reduce_sz = reduce(lambda x, y: x * y, (sz for idx, sz in enumerate(in_dim.shape)
+                                                if idx not in pool_params.axis), 1)
+        #self.c = in_dim.size()/reduce_sz
+        feat = reduce_sz
+        height, width = balanced_divisors(in_dim.size()/reduce_sz)
+
+        if act_params is not None:
+            act_op = gen_activation_op(
+                act_params.activation, force_relu=force_relu, asymmetric=act_q.in_qs[0].zero_point != 0)
+            if out_dim is None:
+                out_dim = act_params.out_dims[0].expand_to_chw()
+            out_q = act_q.out_qs[0]
+        else:
+            act_op = "KOP_NONE"
+
+        attrs = {
+            'in_size': in_q.dtype_bits//8 if in_q.signed else -in_q.dtype_bits//8,
+            'out_size': out_q.dtype_bits//8 if out_q.signed else -out_q.dtype_bits//8,
+            'feat': feat,
+            'width': width,
+            'height': height,
+            'kop_pool': at_pool_params.GlobalPoolOper,
+            'kop_act': act_op
+        }
+
+        extra_attrs = {
+            'cname': cname,
+            'node_name': node_name
+        }
+        super().__init__(attrs, extra_attrs, gen_ctrl=gen_ctrl)
diff --git a/tools/nntool/graph/manipulations/adjust_order.py b/tools/nntool/graph/manipulations/adjust_order.py
index 4067c3397..ef489fb3d 100644
--- a/tools/nntool/graph/manipulations/adjust_order.py
+++ b/tools/nntool/graph/manipulations/adjust_order.py
@@ -26,34 +26,35 @@
 LOG = logging.getLogger("nntool." + __name__)
 
 
-def adjust_order(G, reshape_weights=True, postprocess=True, debug_function=None, one_cycle=False, single_step=False):
-    opts = {'reshape_weights': reshape_weights}
-    selector = AdjusterBase.get_all_handlers(opts)
-    LOG.info("adding transposes to correct tensor order for AT kernels")
-    ConstantInputParameters.clear_compression_state(G)
-    for node in G.nodes(node_classes=tuple(selector)):
-        adjusters = selector[node.__class__]
-        for adjuster, attrs in adjusters:
-            if attrs:
-                not_selected = False
-                for attr, val in attrs.items():
-                    if not hasattr(node, attr):
-                        not_selected = True
-                        break
-                    if callable(val):
-                        if not val(getattr(node, attr)):
+def adjust_order(G, reshape_weights=True, postprocess=True, debug_function=None, steps=None, single_step=False):
+    if steps is None:
+        opts = {'reshape_weights': reshape_weights}
+        selector = AdjusterBase.get_all_handlers(opts)
+        LOG.info("adding transposes to correct tensor order for AT kernels")
+        ConstantInputParameters.clear_compression_state(G)
+        for node in G.nodes(node_classes=tuple(selector)):
+            adjusters = selector[node.__class__]
+            for adjuster, attrs in adjusters:
+                if attrs:
+                    not_selected = False
+                    for attr, val in attrs.items():
+                        if not hasattr(node, attr):
                             not_selected = True
                             break
-                    elif getattr(node, attr) != val:
-                        not_selected = True
-                        break
-                if not_selected:
-                    continue
-            adjuster.adjust(G, node)
-            break
-    add_dimensions(G)
+                        if callable(val):
+                            if not val(getattr(node, attr)):
+                                not_selected = True
+                                break
+                        elif getattr(node, attr) != val:
+                            not_selected = True
+                            break
+                    if not_selected:
+                        continue
+                adjuster.adjust(G, node)
+                break
+        add_dimensions(G)
     if debug_function:
         debug_function(G)
-    if postprocess:
-        eliminate_transposes(G, debug_function=debug_function, one_cycle=one_cycle, single_step=single_step)
-        add_dimensions(G)
+    if steps is not None or postprocess:
+        eliminate_transposes(G, debug_function=debug_function, steps=steps, single_step=single_step)
+        # add_dimensions(G)
diff --git a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py
index 3be578e8d..86174f2f4 100644
--- a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py
+++ b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py
@@ -949,19 +949,23 @@ def delete_step_idx(G, action: DeleteTransposeAction):
     return G.in_edges(action.node)[0].from_node.step_idx
 
 
-def eliminate_transposes(G, debug_function=None, one_cycle=False, single_step=False, do_silly=True):
+def eliminate_transposes(G, debug_function=None, steps=None, single_step=False, do_silly=True):
     info("eliminating unnecessary transposes")
     found_results = True
     pass_count = 0
     while found_results:
+        if steps is not None:
+            if pass_count >= steps:
+                break
+        else:
+            if pass_count >= 50:
+                raise ValueError(
+                    "Sorry, eliminate transposes seems to be stuck in a loop. Please report to GreenWaves.")
         pass_count += 1
-        if pass_count > (200 if single_step else 50):
-            raise ValueError(
-                "Sorry, eliminate transposes is stuck in a loop. Please report to GreenWaves.")
         found_results = False
         visited_nodes = set()
         actions = []
-        info(f"search for transposes +++ PASS {pass_count}")
+        info(f"search for transposes +++ STEP {pass_count}")
         transposes = G.nodes(node_classes=TransposeParameters)
         while transposes:
             transpose_node = transposes.pop(0)
@@ -1037,7 +1041,7 @@ def eliminate_transposes(G, debug_function=None, one_cycle=False, single_step=Fa
                 actions += cur_actions_up
                 visited_nodes |= set(cur_visited_up.nodes)
                 visited_nodes.add(transpose_node)
-                if single_step:
+                if single_step or steps is not None:
                     break
             # if transpose cannot be removed upwards movement push the transpose down if it actually moved
             elif down_count > 0 or (down_count == 0 and transpose_moved(G, cur_actions_down)):
@@ -1047,7 +1051,7 @@ def eliminate_transposes(G, debug_function=None, one_cycle=False, single_step=Fa
                 actions += cur_actions_down
                 visited_nodes |= set(cur_visited_down.nodes)
                 visited_nodes.add(transpose_node)
-                if single_step:
+                if single_step or steps is not None:
                     break
             else:
                 info(
@@ -1065,7 +1069,4 @@ def eliminate_transposes(G, debug_function=None, one_cycle=False, single_step=Fa
         G.add_dimensions()
         if debug_function:
             debug_function(G)
-        if one_cycle:
-            LOG.info("cycle complete")
-            break
     LOG.info("no further transpose sequences found")
diff --git a/tools/nntool/graph/nngraph.py b/tools/nntool/graph/nngraph.py
index 2d4353ef1..abf34a24d 100644
--- a/tools/nntool/graph/nngraph.py
+++ b/tools/nntool/graph/nngraph.py
@@ -290,9 +290,10 @@ def nodes_iterator(self, yield_fusions=True):
             else:
                 yield (step_idx, node, None, None)
 
-    def adjust_order(self, reshape_weights=True, postprocess=True, debug_function=None, one_cycle=False, single_step=False):
+    def adjust_order(self, reshape_weights=True, postprocess=True, debug_function=None, steps=None, single_step=False):
         adjust_order(self, reshape_weights=reshape_weights,
-                     postprocess=postprocess, debug_function=debug_function, one_cycle=one_cycle, single_step=single_step)
+                     postprocess=postprocess, debug_function=debug_function,
+                     steps=steps, single_step=single_step)
         LOG.info("adjusted order")
         self.graph_identity.is_adjusted = True
 
diff --git a/tools/nntool/graph/types/others.py b/tools/nntool/graph/types/others.py
index b9c11d385..2d033305c 100644
--- a/tools/nntool/graph/types/others.py
+++ b/tools/nntool/graph/types/others.py
@@ -668,7 +668,7 @@ def __init__(self, *args, old_shape=None, shape=None, **kwargs):
         if not isinstance(shape, Dim):
             shape = Dim.unnamed(shape)
         if old_shape is not None and not isinstance(old_shape, Dim):
-            old_shape = Dim.unnamed(shape)
+            old_shape = Dim.unnamed(old_shape)
         assert shape.is_ordered and (old_shape is None or old_shape.is_ordered)
         self._shape = shape
         self._old_shape = old_shape
diff --git a/tools/nntool/importer/onnx/handlers/backend/gru.py b/tools/nntool/importer/onnx/handlers/backend/gru.py
index 2b363dc9f..6e6bb8c2d 100644
--- a/tools/nntool/importer/onnx/handlers/backend/gru.py
+++ b/tools/nntool/importer/onnx/handlers/backend/gru.py
@@ -36,11 +36,10 @@ def _common(cls, node, **kwargs):
         input_shapes = [inp[2].shape if inp else None for inp in inputs]
         x = inputs[0]
 
-        seq_len = input_shapes[0][0]
+        seq_len = input_shapes[0][1] if node.attrs.get("layout", 0) else input_shapes[0][0]
         if seq_len is None:
-            logger.warning("sequence length is variable in size. forcing to 20. "
-                           "reexport your graph with sequence length set to the maxmimum sequence size")
-            seq_len = 20
+            seq_len = 1
+
         input_size = input_shapes[0][2]
         hidden_size = node.attrs["hidden_size"]
         direction = node.attrs.get("direction", "forward")
diff --git a/tools/nntool/importer/onnx/handlers/backend/lstm.py b/tools/nntool/importer/onnx/handlers/backend/lstm.py
index 3d4eff864..c043fd030 100644
--- a/tools/nntool/importer/onnx/handlers/backend/lstm.py
+++ b/tools/nntool/importer/onnx/handlers/backend/lstm.py
@@ -35,7 +35,9 @@ def _common(cls, node, **kwargs):
         input_shapes = [inp[2].shape if inp else None for inp in inputs]
         x = inputs[0]
 
-        seq_len = input_shapes[0][0]
+        seq_len = input_shapes[0][1] if node.attrs.get("layout", 0) else input_shapes[0][0]
+        if seq_len is None:
+            seq_len = 1
         input_size = input_shapes[0][2]
         hidden_size = node.attrs["hidden_size"]
         direction = node.attrs.get("direction", "forward")
diff --git a/tools/nntool/importer/onnx/handlers/backend/pad_mixin.py b/tools/nntool/importer/onnx/handlers/backend/pad_mixin.py
index 77c796f0a..0d6c3ddd0 100644
--- a/tools/nntool/importer/onnx/handlers/backend/pad_mixin.py
+++ b/tools/nntool/importer/onnx/handlers/backend/pad_mixin.py
@@ -13,9 +13,28 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
+from itertools import chain
+
 from graph.dim import PadDim
 
+
 class PadMixin(object):
+    @classmethod
+    def mix_pads(cls, val):
+        if len(val) == 0:
+            pads_start = [0, 0]
+            pads_end = [0, 0]
+        elif len(val) == 2:
+            pads_start = [0, val[0]]
+            pads_end = [0, val[1]]
+        elif len(val) == 4:
+            pads_start = val[:2]
+            pads_end = val[2:]
+        else:
+            raise ValueError('unexpected pad length')
+
+        return list(chain(*zip(pads_start, pads_end)))
+
     @classmethod
     def pad_start_with(cls, val, pad_val, dlen):
         return pad_val * (dlen - len(val)) + val
@@ -23,9 +42,7 @@ def pad_start_with(cls, val, pad_val, dlen):
     @classmethod
     def calc_pad_dim(cls, node, expected_len):
         if "auto_pad" not in node.attrs or node.attrs["auto_pad"] == "NOTSET":
-            pads = cls.pad_start_with(node.attrs.get("pads", []), [0], expected_len)
-            pads = pads if len(pads) < 4 else [pads[0], pads[2], pads[1], pads[3]]
-            pad_dim = PadDim(*pads)
+            pad_dim = PadDim(*cls.mix_pads(node.attrs.get("pads", [])))
         elif node.attrs["auto_pad"] == "VALID":
             pad_dim = PadDim.valid()
         elif node.attrs["auto_pad"] == "SAME_UPPER":
diff --git a/tools/nntool/importer/onnx/handlers/backend/rnn.py b/tools/nntool/importer/onnx/handlers/backend/rnn.py
index 0d51005cb..ce22a6340 100644
--- a/tools/nntool/importer/onnx/handlers/backend/rnn.py
+++ b/tools/nntool/importer/onnx/handlers/backend/rnn.py
@@ -35,7 +35,10 @@ def _common(cls, node, **kwargs):
         input_shapes = [inp[2].shape if inp else None for inp in inputs]
         x = inputs[0]
 
-        seq_len = input_shapes[0][0]
+        seq_len = input_shapes[0][1] if node.attrs.get("layout", 0) else input_shapes[0][0]
+        if seq_len is None:
+            seq_len = 1
+
         input_size = input_shapes[0][2]
         hidden_size = node.attrs["hidden_size"]
         direction = node.attrs.get("direction", "forward")
diff --git a/tools/nntool/importer/onnx/handlers/backend/rnn_mixin.py b/tools/nntool/importer/onnx/handlers/backend/rnn_mixin.py
index 2d8ddbd0c..6f8a17329 100644
--- a/tools/nntool/importer/onnx/handlers/backend/rnn_mixin.py
+++ b/tools/nntool/importer/onnx/handlers/backend/rnn_mixin.py
@@ -36,20 +36,20 @@ def deep_update(cls, x, y):
     @classmethod
     def extract_weights(cls, weights, hidden_size, keys, num_directions):
         return {
-            'forward' if dir == 0 else 'backward':
+            'forward' if direction == 0 else 'backward':
                 {keys[i]: arr.reshape((hidden_size, -1))
                  for i, arr in enumerate(np.split(dir_arr, len(keys), axis=1))}
-            for dir, dir_arr in enumerate(np.split(weights, num_directions, axis=0))
+            for direction, dir_arr in enumerate(np.split(weights, num_directions, axis=0))
         }
 
     @classmethod
     def extract_biases(cls, biases, hidden_size, keys, num_directions):
         biases = biases.reshape((num_directions, len(keys), hidden_size))
         return {
-            'forward' if dir == 0 else 'backward':
+            'forward' if direction == 0 else 'backward':
                 {keys[i]: arr.reshape((hidden_size))
                  for i, arr in enumerate(np.split(dir_arr, len(keys), axis=1))}
-            for dir, dir_arr in enumerate(np.split(biases, num_directions, axis=0))
+            for direction, dir_arr in enumerate(np.split(biases, num_directions, axis=0))
         }
 
     @classmethod
diff --git a/tools/nntool/importer/tflite2/handlers/backend/concatenation.py b/tools/nntool/importer/tflite2/handlers/backend/concatenation.py
index 8bea05cd9..dd10ed7fb 100644
--- a/tools/nntool/importer/tflite2/handlers/backend/concatenation.py
+++ b/tools/nntool/importer/tflite2/handlers/backend/concatenation.py
@@ -13,12 +13,13 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
+from collections import Counter
 from functools import reduce
 
 import numpy as np
 from graph.dim import Dim
 from graph.types import ConcatParameters, ConstantInputParameters, NNEdge
-from graph.types.others import ReshapeParameters
+from graph.types.others import CopyParameters, ReshapeParameters
 from importer.common.constant_mixin import ConstantMixin
 from importer.common.provisional_dim import ProvisionalDim
 from importer.tflite2.common import LOG
@@ -63,10 +64,16 @@ def _common(cls, node: TFLiteNode, **kwargs):
 
         buffer_idxes = [tensor.buffer_idx for tensor in node.input]
         non_zero_idxes = [idx for idx in buffer_idxes if idx != 0]
-        if len(set(non_zero_idxes)) != len(non_zero_idxes):
-            raise NotImplementedError(
-                "concats with multiple versions of the same input are not supported. "
-                "This is normally a graph design problem.")
+        duplicates = [idx for idx, count in Counter(non_zero_idxes).items() if count > 1]
+        if duplicates:
+            LOG.warning(f'concat {node.name} has duplicate inputs. Inserting copies but this is not very efficient.')
+            for idx in duplicates:
+                dup_idxes = [i for i, x in enumerate(buffer_idxes) if x == idx]
+                for dup_idx in dup_idxes[1:]:
+                    cparams = CopyParameters(G.unique_name(f'{node.name}_dup_{dup_idxes[0]}_{dup_idx}'))
+                    dup_inp = inputs[dup_idx]
+                    G.add_edge(NNEdge(from_node=dup_inp[0], from_idx=dup_inp[1], to_node=cparams))
+                    inputs[dup_idx] = tuple([cparams, 0] + list(dup_inp[2:]))
 
         axis = node_opts.Axis()
         if any(inp_shape[axis] is None for inp_shape in inp_shapes):
diff --git a/tools/nntool/importer/tflite2/handlers/backend/fully_connected.py b/tools/nntool/importer/tflite2/handlers/backend/fully_connected.py
index ebe001ab5..7593d3a69 100644
--- a/tools/nntool/importer/tflite2/handlers/backend/fully_connected.py
+++ b/tools/nntool/importer/tflite2/handlers/backend/fully_connected.py
@@ -13,11 +13,10 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-from graph.types.tensor_arithmetic import MatMulTransposedParameters
 import numpy as np
 from graph.dim import Dim, FcFilterDim
 from graph.types import (ConstantInputParameters, FcParameters,
-                         MatMulOpParameters, NNEdge, ReshapeParameters,
+                         MatMulTransposedParameters, NNEdge, ReshapeParameters,
                          TransposeParameters)
 from importer.common.provisional_dim import ProvisionalDim
 from importer.tflite2.common import check
@@ -42,13 +41,14 @@ def _common(cls, node, **kwargs):
         all_nodes = kwargs['all_nodes']
 
         keep_dims = node_opts.KeepNumDims()
-        check(not keep_dims,
-              f'keep dims on Fully Connected {node.name} is not supported')
+        # check(not keep_dims,
+        #       f'keep dims on Fully Connected {node.name} is not supported')
 
         inputs = [all_nodes[t] if t is not None else None for t in node.input]
 
         x = inputs[0]
-        x_known_shape = x[2].known_shape
+        x_shape = x[2]
+        x_known_shape = x_shape.known_shape
         inp_sz = np.prod(np.array(x_known_shape))
         weights = inputs[1]
         weights_node = weights[0]
@@ -57,6 +57,20 @@ def _common(cls, node, **kwargs):
               f'bad filter shape {weights_shape} in {node.name}')
         out_c = weights_shape[0]
         batch_size = inp_sz // weights_shape[1]
+
+        keep_dims = node_opts.KeepNumDims()
+        if keep_dims:
+            if x_shape.shape[-1] != weights_shape[1]:
+                raise ValueError(
+                    f'Keep dims set on {node.name} but last input dimension does not match weights')
+            out_shape = x_shape.shape.copy()
+            out_shape[-1] = out_c
+        elif batch_size > 1:
+            out_shape = (batch_size, out_c)
+        else:
+            out_shape = (None, out_c)
+        real_out_shape = tuple(dim for dim in out_shape if dim is not None)
+
         filt_dim = FcFilterDim(weights_shape[0], weights_shape[1])
 
         node.input[1].used = True
@@ -70,7 +84,7 @@ def _common(cls, node, **kwargs):
             bias_node = ConstantInputParameters(f'{node.name}_bias',
                                                 dims=Dim.unnamed([out_c]),
                                                 value=np.zeros([out_c],
-                                                dtype=np.float32))
+                                                               dtype=np.float32))
 
         if batch_size > 1:
             # add a reshape to force the size of the input to batch * in_c
@@ -91,12 +105,15 @@ def _common(cls, node, **kwargs):
             cls.new_load_filter_parameters(G, params, weights_shape, 0,
                                            node.input[0], weights_node,
                                            bias_node, node.output[0], opts)
-            trans2 = TransposeParameters(G.unique_name(f'{node.name}_tin2'), transpose=(1, 0))
-            G.add_edge(NNEdge(from_node=link[0], to_node=params, from_idx=link[1]))
-            G.add_edge(NNEdge(from_node=weights_node, to_node=params, to_idx=1))
+            trans2 = TransposeParameters(G.unique_name(
+                f'{node.name}_tin2'), transpose=(1, 0))
+            G.add_edge(
+                NNEdge(from_node=link[0], to_node=params, from_idx=link[1]))
+            G.add_edge(NNEdge(from_node=weights_node,
+                              to_node=params, to_idx=1))
             #G.add_edge(NNEdge(from_node=trans2, to_node=params, to_idx=1))
             G.add_edge(NNEdge(from_node=bias_node, to_node=params, to_idx=2))
-            out_shape = [batch_size, out_c]
+            fc_shape = (batch_size, out_c)
         else:
             ker_in_order = None
             ker_out_order = None
@@ -116,12 +133,17 @@ def _common(cls, node, **kwargs):
             G.add_edge(NNEdge(from_node=bias_node, to_node=params, to_idx=2))
             G.add_edge(NNEdge(from_node=link[0], to_node=params,
                               from_idx=link[1], to_idx=0))
-
-            out_shape = [None, out_c]
+            fc_shape = (out_c,)
 
         pout_dims = ProvisionalDim(out_shape)
-
         aparams = cls.fuse_activation(node_opts, node.name, params, **kwargs)
+
+        if real_out_shape != fc_shape:
+            rparams = ReshapeParameters(G.unique_name(f'{node.name}_keepdims'),
+                                        old_shape=fc_shape, shape=real_out_shape)
+            G.add_edge(NNEdge(from_node=aparams, to_node=rparams))
+            aparams = rparams
+
         all_nodes[node.output[0]] = (aparams, 0, pout_dims)
         return params
 
diff --git a/tools/nntool/interpreter/commands/adjust.py b/tools/nntool/interpreter/commands/adjust.py
index 73e04b5f9..f03975ac8 100644
--- a/tools/nntool/interpreter/commands/adjust.py
+++ b/tools/nntool/interpreter/commands/adjust.py
@@ -16,19 +16,36 @@
 from cmd2 import Cmd2ArgumentParser, with_argparser
 from interpreter.nntool_shell_base import NNToolShellBase
 
+
 class AdjustCommand(NNToolShellBase):
     # ADJUST COMMAND
     parser_adjust = Cmd2ArgumentParser()
-    parser_adjust.add_argument('-n', '--no_postprocess',
-                               action='store_true', help='Don\'t try to eliminate transposes')
-    parser_adjust.add_argument('-o', '--one_cycle',
-                               action='store_true', help='Do one cycle of post processing')
+    parser_adjust_choices = parser_adjust.add_mutually_exclusive_group()
+    parser_adjust_choices.add_argument(
+        '-n', '--no_postprocess',
+        action='store_true', help='Don\'t try to eliminate transposes')
+    parser_adjust_choices.add_argument(
+        '-o', '--one_step',
+        action='store_true', help='Do one step of post processing')
+    parser_adjust_choices.add_argument(
+        '-s', '--steps',
+        type=int, help='Do fixed number of steps of post processing')
+    parser_adjust_choices.add_argument(
+        '-i', '--individual_step',
+        action='store_true', help='Do the post processing step by step')
+
     @with_argparser(parser_adjust)
     def do_adjust(self, args):
         """
 Adjust activation and parameter tensors to match AutoTiler order.
 Must be run before generating code."""
         self._check_graph()
-        self.G.adjust_order(postprocess=not args.no_postprocess, one_cycle=args.one_cycle)
+        if args.one_step:
+            steps = 1
+        elif args.steps:
+            steps = args.steps
+        else:
+            steps = None
+        self.G.adjust_order(
+            postprocess=not args.no_postprocess, steps=steps, single_step=args.individual_step)
         self.G.add_dimensions()
-        
\ No newline at end of file
diff --git a/tools/nntool/quantization/float/quantizers/pool_float.py b/tools/nntool/quantization/float/quantizers/pool_float.py
new file mode 100644
index 000000000..aeb32042d
--- /dev/null
+++ b/tools/nntool/quantization/float/quantizers/pool_float.py
@@ -0,0 +1,52 @@
+# Copyright (C) 2020  GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+
+from quantization.quantizer_options import HWC_OPTION
+import numpy as np
+from bfloat16 import bfloat16
+from graph.types import PoolingParameters
+from quantization.float.float_quantization_handler import \
+    FloatQuantizionHandler
+from quantization.new_qrec import QRec
+from quantization.qtype import QType
+from quantization.qtype_constraint import MatchAll
+from quantization.unified_quantization_handler import (in_qs_constraint,
+                                                       out_qs_constraint,
+                                                       params_type,
+                                                       options)
+
+
+@options(
+    HWC_OPTION
+)
+@params_type(PoolingParameters)
+@in_qs_constraint(MatchAll({'dtype': set([np.float32, np.float16, bfloat16])}))
+@out_qs_constraint(MatchAll({'dtype': set([np.float32, np.float16, bfloat16])}))
+class FloatDefault(FloatQuantizionHandler):
+    @classmethod
+    def _quantize(cls, params, in_qs, stats, **kwargs):
+        force_out_qs, dtype = cls.get_float_opts(**kwargs)
+        if force_out_qs and any(qtype.dtype != dtype for qtype in force_out_qs if qtype is not None):
+            return None
+        opts = kwargs['opts']
+        if opts['hwc']:
+            cls.check_order(params, [['h', 'w', 'c']], [['h', 'w', 'c']])
+        elif params.in_dims_hint:
+            cls.check_order(params, [['c', 'h', 'w']], [['c', 'h', 'w']])
+        # all inputs and outputs are set to the required float type
+        return QRec.float(in_qs=[QType(dtype=dtype)],
+                          out_qs=[QType(dtype=dtype)],
+                          float_dtype=dtype)
diff --git a/utils/bin/binary-size b/utils/bin/binary-size
index 1e05ee214..8918afc8d 100755
--- a/utils/bin/binary-size
+++ b/utils/bin/binary-size
@@ -110,6 +110,7 @@ groups = {
             Group('PulpOS:kernel', ['pos_kernel.*', 'pos_soc_.*', 'pos_cbsys_.*']),
             Group('PulpOS:fll', ['pos_fll.*', 'pos_freq.*', 'pi_freq_.*']),
             Group('PulpOS:cpi', ['pos_cpi.*', 'pi_cpi_.*']),
+            Group('PulpOS:i2s', ['pos_i2s.*', 'pi_i2s_.*']),
             Group('PulpOS:mram', ['pos_mram.*']),
             Group('PulpOS:rtc', ['pos_rtc.*']),
             Group('PulpOS:cluster', ['pos_cluster.*', 'pi_cluster_.*']),
diff --git a/utils/gap_configs/python/ips/iss/iss.py b/utils/gap_configs/python/ips/iss/iss.py
index ade8b4ad1..d19fbe1ee 100644
--- a/utils/gap_configs/python/ips/iss/iss.py
+++ b/utils/gap_configs/python/ips/iss/iss.py
@@ -34,7 +34,9 @@ class Iss(st.Component):
     riscv_dbg_unit : bool, optional
         True if a riscv debug unit should be included, False otherwise (default: False).
     debug_binaries : list, optional
-        A list of path to riscv binaries which can be used to get debug symbols for the assembly trace (default: []).
+        A list of paths to riscv binaries' debug info which can be used to get debug symbols for the assembly trace (default: []).
+    binaries : list, optional
+        A list of paths to riscv binaries (default: []).
     debug_handler : int, optional
         The address where the core should jump when switching to debug mode (default: 0).
     power_models : dict, optional
@@ -62,6 +64,7 @@ def __init__(self,
             first_external_pcer: int=0,
             riscv_dbg_unit: bool=False,
             debug_binaries: list=[],
+            binaries: list=[],
             debug_handler: int=0,
             power_models: dict={},
             power_models_file: str=None,
@@ -82,6 +85,7 @@ def __init__(self,
             'first_external_pcer': first_external_pcer,
             'riscv_dbg_unit': riscv_dbg_unit,
             'debug_binaries': debug_binaries,
+            'binaries': binaries,
             'debug_handler': debug_handler,
             'power_models': power_models,
             'cluster_id': cluster_id,
diff --git a/utils/rules/pmsis_rules.mk b/utils/rules/pmsis_rules.mk
index ce9288edc..73c6c59be 100644
--- a/utils/rules/pmsis_rules.mk
+++ b/utils/rules/pmsis_rules.mk
@@ -421,4 +421,4 @@ profiler:
 	cd $(BUILDDIR) && export PULP_CONFIG_FILE=$(BUILDDIR)/gvsoc_config.json && profiler $(BUILDDIR) $(BIN) gvsoc_config.json --signal-tree-file=$(PROFILER_SIGNAL_TREE)
 
 size:
-	@$(GAP_SDK_HOME)/utils/bin/binary-size --binary=$(BIN) --depth=10 --groups=$(PMSIS_OS) >> $(BIN).size
+	$(GAP_SDK_HOME)/utils/bin/binary-size --binary=$(BIN) --depth=10 --groups=$(PMSIS_OS)

From db984bc652efaaca0caaece17dc1270bf8a13685 Mon Sep 17 00:00:00 2001
From: yao <yao.zhang@greenwaves-technologies.com>
Date: Mon, 20 Dec 2021 16:08:53 +0100
Subject: [PATCH 3/3] Remove unused files

---
 tools/autotiler_v3_get/LICENSE           | 204 --------------
 tools/autotiler_v3_get/Makefile          |  25 --
 tools/autotiler_v3_get/download_tiler.sh |  33 ---
 tools/autotiler_v3_get/get_tiler.py      | 335 -----------------------
 4 files changed, 597 deletions(-)
 delete mode 100644 tools/autotiler_v3_get/LICENSE
 delete mode 100644 tools/autotiler_v3_get/Makefile
 delete mode 100755 tools/autotiler_v3_get/download_tiler.sh
 delete mode 100644 tools/autotiler_v3_get/get_tiler.py

diff --git a/tools/autotiler_v3_get/LICENSE b/tools/autotiler_v3_get/LICENSE
deleted file mode 100644
index e8567dbda..000000000
--- a/tools/autotiler_v3_get/LICENSE
+++ /dev/null
@@ -1,204 +0,0 @@
-LICENSE AGREEMENT FOR USE OF THE AUTOTILER
-
-Definitions 
-“Affiliate(s)” of a person means any entity that Controls, is Controlled by, or
-is under common Control with such person, where “Control” of a person or 
-entity (for purposes of this definition only) means the possession, directly 
-or indirectly, of the power to direct or cause the direction of the 
-management, operating policies, or assets of that person or entity, whether by
-way of ownership of more than 50% of its voting or equity securities or
-assets or by way of contract, management agreement, voting trust, or
-otherwise.
-“Authorized User(s)” means the individuals You authorize to access the Software,
-including Your employees or third parties that access the Software solely on
-Your behalf for Your internal and/or personal operations.
-“GreenWaves Technologies” “we,” “our” or “us” means GreenWaves Technologies SAS,
-a company incorporated under the laws of France, and registered with the Trade
-and Companies Registry of Grenoble, under number 808 076 582, having its
-principal place of business at 28 cours Jean Jaurès, 38000 Grenoble, France.
-“Documentation” means the GreenWaves Technologies user or technical manuals,
-specifications, privacy data sheets or other information applicable to the
-Software.
-“Personal Data” means any information that can be used to identify an individual
-and is required in order to enable You to download the Software, i.e Your name
-and email address, as well as Your company name.
-“Software” means Greenwaves Technologies’ software Autotiler which is a
-downloadable file enabling easier programming of GreenWaves Technologies’ GAP
-products by automatic generation of code for memory tiling and transfers between
-memory levels. 
-“You” and “Your” means the individual or legal entity downloading the Software.
-
-1. Delivery 
-The Software is deemed to be delivered and accepted by You at the date it is
-made available for download by You.  By downloading the Software, You agree to
-be bound by the terms of this EULA. If You do not have the authority to enter
-into this EULA or You do not agree with its terms, do not Use the Software and
-uninstall it.
-
-2. Scope of the license
-Subject to Your compliance with this EULA, GreenWaves Technologies grants You a
-worldwide, non-exclusive and non-transferable license to use the Software and
-related Documentation, as well as the code automatically generated by the
-Software for memory tiling and transfers between memory levels, for Your
-internal and/or personal use in accordance with the terms set forth, solely
-in order to program GreenWaves Technologies’ products, or to execute or simulate
-software on another system with a final objective of developing code to be used
-on GreenWaves Technologies' product. GreenWaves Technologies also grants You a
-worldwide, non-exclusive and non-transferable license to compile the code
-automatically generated by the Software, for the exclusive purpose of being run
-on a GreenWaves Technologies’ product as embedded in Your product. Free or open
-source software not owned by GreenWaves Technologies is subject to separate
-license terms made available with the Software Development Kit or at
-https://github.com/GreenWaves-Technologies/gap_sdk. 
- 
-3. Ownership
-GreenWaves Technologies retains exclusive ownership in all intellectual property
-rights in and to the Software and all underlying technologies and associated
-Documentation related thereto.
-
-4. Limitations and Restrictions
-Unless expressly authorized by GreenWaves Technologies in writing or otherwise
-permitted under applicable law, You shall not: 
-sell, resell, transfer, sublicense, or assign Your rights under this license;
-modify, adapt or create derivative works; 
-reverse engineer, decompile, decrypt, disassemble or otherwise attempt to derive
-the source code;
-make the functionality available to third parties, whether as an application
-service provider, or on an outsourcing, membership or subscription, rental,
-service bureau, cloud service, managed or hosted service, or other similar
-basis; 
-use Software that is licensed for the development of code to run products other
-than GreenWaves Technologies’ GAP products; 
-remove, modify, or conceal any product identification, copyright, proprietary,
-intellectual property notices or other marks from the Software and related
-Documentation, as well as from the code automatically generated by the Software.
-
-5. Use by Authorized Users
-You may allow Authorized Users to use the Software solely on Your behalf for
-Your internal and/or personal operations. You are responsible for ensuring that
-Authorized Users comply with the terms of this EULA and You are liable for any
-breach of the same by such Authorized Users.
-
-6. Limited Warranty 
-To the extent permitted by applicable law, You expressly acknowledge and agree
-that the Software, as well as the code it automatically generates, are provided
-“AS-IS”, used at Your sole risk and without support or any express or implied
-warranty of any kind or indemnity for any problems or issues. GreenWaves
-Technologies hereby disclaims all warranties and conditions with respect to the
-Software, either express, implied or statutory, including, but not limited to,
-the implied warranties and/or conditions of merchantability, satisfactory
-quality, fitness for a particular purpose, accuracy, quiet enjoyment, and
-non-infringement of third party rights.
-GreenWaves Technologies does not warrant against interference with your
-enjoyment of the Software, that the functions contained in, or services
-performed or provided by, the Software will meet your requirements, that the
-operation of the Software will be uninterrupted or error-free, that defects in
-the Software will be corrected, or that the Software will be compatible or work
-with any third party software, applications or third party services other than
-those expressly identified by GreenWaves Technologies. 
-You further acknowledge that the Software is not intended or suitable for use
-in situations or environments where the failure or time delays of, or errors or
-inaccuracies in, the content, data or information provided by the Software could
-lead to death, personal injury, or severe physical or environmental damage,
-including without limitation the operation of nuclear facilities, aircraft
-navigation or communication systems, air traffic control, life support or
-weapons systems.
-
-7. Limitations of Liability
-To the extent not prohibited by applicable law, in no event shall GreenWaves
-Technologies, its Affiliates, agents, employees or principals, be liable for the
-following, regardless of the theory of liability or whether relating to or
-arising out of this EULA, the Software or otherwise, even if a party has been
-advised of the possibility of such damages: 
-indirect, incidental, exemplary, special or consequential damages; 
-loss or corruption of data or interrupted or loss of business; or 
-loss of revenue, profits, goodwill or anticipated sales or savings. 
-Some jurisdictions do not allow the exclusion or limitation of liability for
-personal injury, or of incidental or consequential damages, so this limitation
-may not apply to You. 
-To the extent permitted by applicable law, all liability of GreenWaves
-Technologies, its Affiliates, agents, employees, collectively, to You, whether
-based in warranty, contract, tort (including negligence), or otherwise (other
-than as may be required by applicable law in cases involving personal injury),
-shall not exceed, in the aggregate, the total amount of one thousand euros
-(€1,000). This limitation of liability for the Software is cumulative and not
-per incident. 
-Nothing in this EULA limits or excludes any liability that cannot be limited or
-excluded under applicable law.
-
-8. Term and Termination
-Your license begins on the date the Software is downloaded and continues until
-terminated. 
-To the extent permitted under applicable law, GreenWaves Technologies has
-the right to immediately suspend or terminate Your use of the Software if
-You breach Section 2, 4 or 5 of this EULA. Upon termination of the EULA,
-You must cease any further use of the Software, and destroy any copies of
-Software within Your control. 
-
-9. Personal Data
-GreenWaves Technologies processes and uses Personal Data in order to enable You
-to download the Software and keep a track record of the persons who are bound
-by the present EULA. If You have accepted to receive news on GreenWaves
-Technologies products, You will receive emails with updates, new versions and
-new products available. 
-Your Personal Data is not sold or otherwise transmitted to any third party. It
-is stored by GreenWaves Technologies for a period of 10 years further to Your
-download of the Software. If You have subscribed to GreenWaves Technologies
-newsletter, Your Personal Data will be stored as long as You do not unsubscribe
-from this service. 
-GreenWaves Technologies will maintain appropriate administrative, physical and
-technical safeguards, which are designed to protect the security,
-confidentiality and integrity of Personal Data processed by GreenWaves
-Technologies. 
-To the extent required by applicable law, in particular EU Regulation 2016/679
-(the General Data Protection Regulation), You may have the right to access
-certain Personal Data we process, request that we update or modify such Personal
-Data when inaccurate, object to or restrict our use of Your Personal Data,
-withdraw Your consent at any time when GreenWaves Technologies processes Your
-Personal Data based on Your consent, and request that we delete Your Personal
-Data, by writing to websales@greenwaves-technologies.com. 
-If You think that the way GreenWaves Technologies processes Your Personal Data
-does not comply with applicable data protection laws, You can contact the
-relevant competent data protection authority. Contact information for the EU
-data protection authorities is available at
-http://ec.europa.eu/justice/article-29/structure/data-protection-authorities/index_en.htm. 
-
-10. Export Controls
-You may not use or otherwise export or re-export the Software except as
-authorized by applicable laws and the laws of the jurisdictions in which the
-Software was obtained.
-
-11. Force Majeure
-Neither party will be responsible for failure of performance due to an event
-that is unforeseeable and beyond the affected party’s reasonable control,
-including accidents, severe weather events, acts of God, actions of any
-government agency, pandemic, acts of terrorism, or the stability or availability
-of the Internet or portions thereof.
-
-12. Complete Agreement
-If any portion of this EULA is found to be void or unenforceable, the remaining
-provisions of the EULA shall remain in full force and effect. Except as
-expressly stated or as expressly amended in a signed agreement, the EULA is the
-complete agreement between the parties with respect to the Software and
-supersedes all prior or contemporaneous communications, understandings or
-agreements (whether written or oral) regarding this subject matter. 
-To the extent permitted by applicable law, the parties agree that the English
-version of the EULA will govern in the event of a conflict between it and any
-version translated into another language.
-
-13. Notices
-GreenWaves Technologies may provide You with notice via email and/or postings on
-the greenwaves-technologies.com website. 
-Notices to GreenWaves Technologies should be sent to GreenWaves Technologies, 28
-cours Jean Jaurès, 38000 GRENOBLE, or to websales@greenwaves-technologies.com.
-
-14. Dispute Settlement and Governing Laws
-The validation, interpretation, modification, fulfillment, and dispute
-settlement of this EULA are governed by the laws of France, without regard
-to conflict of law principles. If any dispute arises concerning the content or
-performance of this EULA, the parties shall first seek an amicable
-settlement. In the event that the dispute cannot be settled through
-negotiation, either party may submit the dispute to the French courts.
-
diff --git a/tools/autotiler_v3_get/Makefile b/tools/autotiler_v3_get/Makefile
deleted file mode 100644
index dc12e7fd7..000000000
--- a/tools/autotiler_v3_get/Makefile
+++ /dev/null
@@ -1,25 +0,0 @@
-TILER_VER=4.3.0
-export TILER_LIB=libtile.${TILER_VER}.a
-ifdef GAP_SDK_HOME
-export TILER_URL=$(GAP_SDK_HOME)/.tiler_url
-else
-export TILER_URL=.tiler_url
-endif
-
-all: Autotiler/LibTile.a
-
-clean:
-	rm -rf Autotiler/LibTile*
-	rm -f $(TILER_URL)
-
-ifeq (,$(wildcard $(TILER_URL)))
-$(TILER_URL): get_tiler.py
-	python3 get_tiler.py
-endif
-
-Autotiler/LibTile.a: $(TILER_URL)
-	mkdir -p Autotiler
-	rm -rf Autotiler/LibTile*
-	./download_tiler.sh
-
-.PHONY: all clean
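
The deleted Makefile gates everything on a cached .tiler_url token: the
$(TILER_URL) rule is only defined while that file is absent, so get_tiler.py
runs at most once and later builds reuse the recorded URL. A minimal Python
sketch of the same idea (ensure_tiler_url is a hypothetical helper, not part
of the SDK):

    # Run the registration step only while the cached token is missing,
    # then reuse the URL it recorded (mirrors the ifeq/wildcard guard).
    import os
    import subprocess

    def ensure_tiler_url(token_path=".tiler_url"):
        """Return the cached download base URL, registering first if needed."""
        if not os.path.exists(token_path):
            # get_tiler.py writes the URL to the file named by $TILER_URL.
            env = dict(os.environ, TILER_URL=token_path)
            subprocess.run(["python3", "get_tiler.py"], check=True, env=env)
        with open(token_path) as f:
            return f.read().strip()

    if __name__ == "__main__":
        print(ensure_tiler_url(os.environ.get("TILER_URL", ".tiler_url")))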
diff --git a/tools/autotiler_v3_get/download_tiler.sh b/tools/autotiler_v3_get/download_tiler.sh
deleted file mode 100755
index 0ca2e7b46..000000000
--- a/tools/autotiler_v3_get/download_tiler.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/usr/bin/env bash
-
-if [ -z "$TILER_LICENSE_AGREED" ]
-then
-	echo "Please read carefully the autotiler agreement before proceeding with the download."
-	read -n 1 -s -r -p "Press enter when you are ready to display the license."
-
-	more LICENSE
-
-	while true ; do
-		read -n 1 -p "Do you agree with the license (y/n) ? " key
-		echo
-		if [ "$key" == "y" ]; then
-			break
-		fi
-		if [ "$key" == "n" ]; then
-			exit 0
-		fi
-	done
-fi
-
-echo ${TILER_LIB} | wget --no-use-server-timestamps --base="$(cat "$TILER_URL")" --input-file=- -O Autotiler/LibTile.a
-if [ $? -ne 0 ]; then
-	rm -f "$TILER_URL" Autotiler/LibTile.a
-	exit 1
-fi
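
The deleted script pipes the library file name into wget, joining it to the
cached base URL, and on any failure drops the cached URL so the next run
restarts the registration. A hedged Python equivalent of that flow, sketched
with the requests package that get_tiler.py already depends on (fetch_tiler is
an illustrative name only):

    import os
    import requests

    def fetch_tiler(token_path, lib_name, dest="Autotiler/LibTile.a"):
        with open(token_path) as f:
            base = f.read().strip()  # get_tiler.py enforces a trailing '/'
        os.makedirs(os.path.dirname(dest), exist_ok=True)
        try:
            r = requests.get(base + lib_name, stream=True, timeout=30)
            r.raise_for_status()
            with open(dest, "wb") as out:
                for chunk in r.iter_content(chunk_size=8192):
                    out.write(chunk)
        except (requests.RequestException, OSError):
            os.remove(token_path)  # force re-registration on the next run
            if os.path.exists(dest):
                os.remove(dest)
            raise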
diff --git a/tools/autotiler_v3_get/get_tiler.py b/tools/autotiler_v3_get/get_tiler.py
deleted file mode 100644
index 34f1ec57c..000000000
--- a/tools/autotiler_v3_get/get_tiler.py
+++ /dev/null
@@ -1,335 +0,0 @@
-import requests, json, sys, re, os
-
-class bcolors:
-    HEADER = '\033[95m'
-    OKBLUE = '\033[94m'
-    OKGREEN = '\033[92m'
-    WARNING = '\033[93m'
-    FAIL = '\033[91m'
-    ENDC = '\033[0m'
-    BOLD = '\033[1m'
-    UNDERLINE = '\033[4m'
-
-Countries = [
-    ('US', 'United States'),
-    ('AF', 'Afghanistan'),
-    ('AL', 'Albania'),
-    ('DZ', 'Algeria'),
-    ('AS', 'American Samoa'),
-    ('AD', 'Andorra'),
-    ('AO', 'Angola'),
-    ('AI', 'Anguilla'),
-    ('AQ', 'Antarctica'),
-    ('AG', 'Antigua And Barbuda'),
-    ('AR', 'Argentina'),
-    ('AM', 'Armenia'),
-    ('AW', 'Aruba'),
-    ('AU', 'Australia'),
-    ('AT', 'Austria'),
-    ('AZ', 'Azerbaijan'),
-    ('BS', 'Bahamas'),
-    ('BH', 'Bahrain'),
-    ('BD', 'Bangladesh'),
-    ('BB', 'Barbados'),
-    ('BY', 'Belarus'),
-    ('BE', 'Belgium'),
-    ('BZ', 'Belize'),
-    ('BJ', 'Benin'),
-    ('BM', 'Bermuda'),
-    ('BT', 'Bhutan'),
-    ('BO', 'Bolivia'),
-    ('BA', 'Bosnia And Herzegowina'),
-    ('BW', 'Botswana'),
-    ('BV', 'Bouvet Island'),
-    ('BR', 'Brazil'),
-    ('BN', 'Brunei Darussalam'),
-    ('BG', 'Bulgaria'),
-    ('BF', 'Burkina Faso'),
-    ('BI', 'Burundi'),
-    ('KH', 'Cambodia'),
-    ('CM', 'Cameroon'),
-    ('CA', 'Canada'),
-    ('CV', 'Cape Verde'),
-    ('KY', 'Cayman Islands'),
-    ('CF', 'Central African Rep'),
-    ('TD', 'Chad'),
-    ('CL', 'Chile'),
-    ('CN', 'China'),
-    ('CX', 'Christmas Island'),
-    ('CC', 'Cocos Islands'),
-    ('CO', 'Colombia'),
-    ('KM', 'Comoros'),
-    ('CG', 'Congo'),
-    ('CK', 'Cook Islands'),
-    ('CR', 'Costa Rica'),
-    ('CI', "Cote D'Ivoire"),
-    ('HR', 'Croatia'),
-    ('CU', 'Cuba'),
-    ('CY', 'Cyprus'),
-    ('CZ', 'Czech Republic'),
-    ('DK', 'Denmark'),
-    ('DJ', 'Djibouti'),
-    ('DM', 'Dominica'),
-    ('DO', 'Dominican Republic'),
-    ('TP', 'East Timor'),
-    ('EC', 'Ecuador'),
-    ('EG', 'Egypt'),
-    ('SV', 'El Salvador'),
-    ('GQ', 'Equatorial Guinea'),
-    ('ER', 'Eritrea'),
-    ('EE', 'Estonia'),
-    ('ET', 'Ethiopia'),
-    ('FK', 'Falkland Islands (Malvinas)'),
-    ('FO', 'Faroe Islands'),
-    ('FJ', 'Fiji'),
-    ('FI', 'Finland'),
-    ('FR', 'France'),
-    ('GF', 'French Guiana'),
-    ('PF', 'French Polynesia'),
-    ('TF', 'French S. Territories'),
-    ('GA', 'Gabon'),
-    ('GM', 'Gambia'),
-    ('GE', 'Georgia'),
-    ('DE', 'Germany'),
-    ('GH', 'Ghana'),
-    ('GI', 'Gibraltar'),
-    ('GR', 'Greece'),
-    ('GL', 'Greenland'),
-    ('GD', 'Grenada'),
-    ('GP', 'Guadeloupe'),
-    ('GU', 'Guam'),
-    ('GT', 'Guatemala'),
-    ('GN', 'Guinea'),
-    ('GW', 'Guinea-bissau'),
-    ('GY', 'Guyana'),
-    ('HT', 'Haiti'),
-    ('HN', 'Honduras'),
-    ('HK', 'Hong Kong'),
-    ('HU', 'Hungary'),
-    ('IS', 'Iceland'),
-    ('IN', 'India'),
-    ('ID', 'Indonesia'),
-    ('IR', 'Iran'),
-    ('IQ', 'Iraq'),
-    ('IE', 'Ireland'),
-    ('IL', 'Israel'),
-    ('IT', 'Italy'),
-    ('JM', 'Jamaica'),
-    ('JP', 'Japan'),
-    ('JO', 'Jordan'),
-    ('KZ', 'Kazakhstan'),
-    ('KE', 'Kenya'),
-    ('KI', 'Kiribati'),
-    ('KP', 'Korea (North)'),
-    ('KR', 'Korea (South)'),
-    ('KW', 'Kuwait'),
-    ('KG', 'Kyrgyzstan'),
-    ('LA', 'Laos'),
-    ('LV', 'Latvia'),
-    ('LB', 'Lebanon'),
-    ('LS', 'Lesotho'),
-    ('LR', 'Liberia'),
-    ('LY', 'Libya'),
-    ('LI', 'Liechtenstein'),
-    ('LT', 'Lithuania'),
-    ('LU', 'Luxembourg'),
-    ('MO', 'Macau'),
-    ('MK', 'Macedonia'),
-    ('MG', 'Madagascar'),
-    ('MW', 'Malawi'),
-    ('MY', 'Malaysia'),
-    ('MV', 'Maldives'),
-    ('ML', 'Mali'),
-    ('MT', 'Malta'),
-    ('MH', 'Marshall Islands'),
-    ('MQ', 'Martinique'),
-    ('MR', 'Mauritania'),
-    ('MU', 'Mauritius'),
-    ('YT', 'Mayotte'),
-    ('MX', 'Mexico'),
-    ('FM', 'Micronesia'),
-    ('MD', 'Moldova'),
-    ('MC', 'Monaco'),
-    ('MN', 'Mongolia'),
-    ('MS', 'Montserrat'),
-    ('MA', 'Morocco'),
-    ('MZ', 'Mozambique'),
-    ('MM', 'Myanmar'),
-    ('NA', 'Namibia'),
-    ('NR', 'Nauru'),
-    ('NP', 'Nepal'),
-    ('NL', 'Netherlands'),
-    ('AN', 'Netherlands Antilles'),
-    ('NC', 'New Caledonia'),
-    ('NZ', 'New Zealand'),
-    ('NI', 'Nicaragua'),
-    ('NE', 'Niger'),
-    ('NG', 'Nigeria'),
-    ('NU', 'Niue'),
-    ('NF', 'Norfolk Island'),
-    ('MP', 'Northern Mariana Islands'),
-    ('NO', 'Norway'),
-    ('OM', 'Oman'),
-    ('PK', 'Pakistan'),
-    ('PW', 'Palau'),
-    ('PA', 'Panama'),
-    ('PG', 'Papua New Guinea'),
-    ('PY', 'Paraguay'),
-    ('PE', 'Peru'),
-    ('PH', 'Philippines'),
-    ('PN', 'Pitcairn'),
-    ('PL', 'Poland'),
-    ('PT', 'Portugal'),
-    ('PR', 'Puerto Rico'),
-    ('QA', 'Qatar'),
-    ('RE', 'Reunion'),
-    ('RO', 'Romania'),
-    ('RU', 'Russian Federation'),
-    ('RW', 'Rwanda'),
-    ('KN', 'Saint Kitts And Nevis'),
-    ('LC', 'Saint Lucia'),
-    ('VC', 'St Vincent/Grenadines'),
-    ('WS', 'Samoa'),
-    ('SM', 'San Marino'),
-    ('ST', 'Sao Tome'),
-    ('SA', 'Saudi Arabia'),
-    ('SN', 'Senegal'),
-    ('SC', 'Seychelles'),
-    ('SL', 'Sierra Leone'),
-    ('SG', 'Singapore'),
-    ('SK', 'Slovakia'),
-    ('SI', 'Slovenia'),
-    ('SB', 'Solomon Islands'),
-    ('SO', 'Somalia'),
-    ('ZA', 'South Africa'),
-    ('ES', 'Spain'),
-    ('LK', 'Sri Lanka'),
-    ('SH', 'St. Helena'),
-    ('PM', 'St.Pierre'),
-    ('SD', 'Sudan'),
-    ('SR', 'Suriname'),
-    ('SZ', 'Swaziland'),
-    ('SE', 'Sweden'),
-    ('CH', 'Switzerland'),
-    ('SY', 'Syrian Arab Republic'),
-    ('TW', 'Taiwan'),
-    ('TJ', 'Tajikistan'),
-    ('TZ', 'Tanzania'),
-    ('TH', 'Thailand'),
-    ('TG', 'Togo'),
-    ('TK', 'Tokelau'),
-    ('TO', 'Tonga'),
-    ('TT', 'Trinidad And Tobago'),
-    ('TN', 'Tunisia'),
-    ('TR', 'Turkey'),
-    ('TM', 'Turkmenistan'),
-    ('TV', 'Tuvalu'),
-    ('UG', 'Uganda'),
-    ('UA', 'Ukraine'),
-    ('AE', 'United Arab Emirates'),
-    ('UK', 'United Kingdom'),
-    ('UY', 'Uruguay'),
-    ('UZ', 'Uzbekistan'),
-    ('VU', 'Vanuatu'),
-    ('VA', 'Vatican City State'),
-    ('VE', 'Venezuela'),
-    ('VN', 'Viet Nam'),
-    ('VG', 'Virgin Islands (British)'),
-    ('VI', 'Virgin Islands (U.S.)'),
-    ('EH', 'Western Sahara'),
-    ('YE', 'Yemen'),
-    ('YU', 'Yugoslavia'),
-    ('ZR', 'Zaire'),
-    ('ZM', 'Zambia'),
-    ('ZW', 'Zimbabwe')
-]
-
-def fatal_error(error):
-    print (bcolors.FAIL + 'FATAL ERROR: ' + str(error) + bcolors.ENDC)
-    sys.exit(1)
-
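-# Note: download_file below is not called from this script; the actual
-# download is performed by download_tiler.sh once the URL has been recorded.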
-def download_file(url):
-    local_filename = url.split('/')[-1]
-    r = requests.get(url, stream=True, timeout=3)
-    r.raise_for_status()
-    with open(local_filename, 'wb') as f:
-        for chunk in r.iter_content(chunk_size=1024): 
-            if chunk:
-                f.write(chunk)
-    return local_filename
-
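-# Use raw_input on Python 2 and input on Python 3; both read a line from stdin.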
-ri = vars(__builtins__).get('raw_input',input)
-print("Registration is required to load the GAP8 AutoTiler library\n")
-
-print("In case you have already registered, you can directly enter the link")
-print("to the AutoTiler library that you received from an email.")
-print("Otherwise just press Enter.")
-url = ri("Enter URL from email: ")
-
-if url == '':
-    print("You will be prompted for your name, company and email address and the")
-    print("link for the AutoTiler libray will be sent to your email address.")
-    print("This information is used uniquely to keep track of AutoTiler users.")
-    forename = ri("Enter your first name: ")
-    surname = ri("Enter your last name: ")
-    company = ri("Enter your company name: ")
-
-    while True:
-        country = ri("Enter your country: ")
-        country = country.upper()
-        matches = []
-        for c in Countries:
-            if c[0] == country or c[1].upper() == country:
-                matches = [c]
-                break
-            elif c[1].upper().startswith(country):
-                matches.append(c)
-        if len(matches) == 1:
-            country = matches[0][0]
-            break
-        elif len(matches) > 1:
-            print("Do you mean:")
-            for c in matches:
-                print("{} ({})".format(c[1], c[0]))
-            print()
-        else:
-            print("I don't know that country. Please enter a valid country name or ISO code.")
-
-    print("Country ", country)
-
-    while True:
-        email = ri("Enter your email address: ")
-        if re.match(r"[^@]+@[^@]+\.[^@]+", email):
-            break
-        else:
-            print("Please enter a valid email address")
-
-    url = 'https://hooks.zapier.com/hooks/catch/2624512/e6qico/'
-    payload = { 'forename': forename, 'surname': surname, 'company': company, 'email': email, 'country': country }
-    headers = { 'content-type': 'application/json' }
-
-    print("Triggering email ... please wait")
-    try:
-        response = requests.post(url, data=json.dumps(payload), headers=headers, timeout=3)
-        response.raise_for_status()
-    except requests.exceptions.RequestException as ex:
-        fatal_error(ex)
-
-
-    print("Please check your email and copy and paste the URL in the email below")
-    print("Please keep this URL, you will ask to enter it in case you")
-    print("install again the SDK.")
-    url = ri("Enter URL from email: ")
-while True:
-    if re.match(r"^https://.*/$", url):
-        break
-    print("Please enter a valid URL; it must start with https:// and")
-    print("end with /.")
-    url = ri("Enter URL from email: ")
-
-try:
-    with open(os.environ['TILER_URL'], "w") as f:
-        f.write(url)
-except (OSError, KeyError) as ex:
-    fatal_error("problem writing file " + str(os.environ.get('TILER_URL')) + ": " + str(ex))
\ No newline at end of file
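
For reference, the country prompt in the deleted script resolves input in two
steps: an exact ISO code or full-name match wins outright, while a name prefix
collects every candidate so the user can be shown the alternatives. A
standalone sketch of that behaviour (match_country is a hypothetical name, not
part of get_tiler.py):

    def match_country(text, countries):
        text = text.upper()
        matches = []
        for code, name in countries:
            if code == text or name.upper() == text:
                return [(code, name)]         # exact hit short-circuits
            if name.upper().startswith(text):
                matches.append((code, name))  # prefix candidate
        return matches

    # match_country("FR", Countries)     -> [('FR', 'France')]  (exact code)
    # match_country("UNITED", Countries) -> the US, AE and UK entries (ambiguous)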