From 1127a6040662ed4548aac4b479308a6e793cd071 Mon Sep 17 00:00:00 2001 From: Lisa Ong Date: Fri, 24 Mar 2023 12:03:35 +0800 Subject: [PATCH] Squashed commit of the following: commit 40dffe83929973c8e205c395be5db23c360c2397 Author: Denny Sun Date: Thu Mar 23 05:06:51 2023 +0000 Merged PR 3176: [Accera] split_dim op supports dynamic dims with static split size With this fix the following test case which has dynamic dims with static split size can succeed. ``` M, MN = create_dimensions() N = 16 Input = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(MN,)) Output = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(M, N)) nest = Nest(shape=(M, N)) i, j = nest.get_indices() @nest.iteration_logic def _(): split_input = Input._split_dimension(0, cast(16, ScalarType.index)) Output[i, j] = split_input[i, j] ``` commit 451b67405d77ebbe1cf0722f9c7aeb191c3b4beb Author: Mason Remy Date: Thu Mar 23 01:19:37 2023 +0000 Merged PR 3174: Ensure any dynamic allocations are heap allocs that get dealloced Ensure any dynamic allocations are heap allocs commit 602b068f19cdf1ff9111aacbeb5d704974521ebd Author: Kern Handa Date: Wed Mar 22 20:59:43 2023 +0000 Merged PR 3171: [test] Add some tests for Dimensions commit ccd1f5c39964fbe96815672f137b5185ee2e9885 Author: Mason Remy Date: Wed Mar 22 19:41:02 2023 +0000 Merged PR 3175: Support reinterpret cast of same bitwidth without changing layout Support reinterpret cast of same bitwidth without changing layout commit 270a3c8a9c1e1c06b0da3c9d61fa2f04438e3076 Author: Kern Handa Date: Fri Mar 17 22:16:08 2023 +0000 Merged PR 3167: Remove hack to treat INPUT_OUTPUT Arrays with shape (1,) as Elements I don't have complete context on this, so this might break something. If it does, that should be fixed separately rather than keep this hack around, which breaks semantics in non-obvious ways. commit efcff61727c64e7f0a37f4f92c701bc47ea1c470 Author: Lisa Ong Date: Fri Mar 17 08:09:07 2023 +0000 Merged PR 3165: [build] Fix clang 14 release build warnings treated as errors on macOS/Apple Errors are showing up on release builds: ``` cmake .. -DCMAKE_BUILD_TYPE=Release -G Ninja cmake --build . --config Release ``` Clang version: ``` Apple clang version 14.0.0 (clang-1400.0.29.202) Target: arm64-apple-darwin22.3.0 Thread model: posix ``` commit 43f311aa706214243ce8d7acca7d29993bb7003b Author: Lisa Ong Date: Fri Mar 17 07:02:09 2023 +0000 Merged PR 3162: Bump vcpkg to latest release Last release was Sept 2022. 
Update to the latest tag (2023.02.24) Preparation for LLVM 15 upgrade commit 07098f502596d997bbe241e95f1130c11e318220 Author: Mason Remy Date: Thu Mar 16 23:27:04 2023 +0000 Merged PR 3161: Fix cache reduce scale constant hoisting Fix cache reduce scale constant hoisting commit 696ef0df5947067f94b64255e00b7fffc4c04f9d Author: Mason Remy Date: Thu Mar 16 20:54:22 2023 +0000 Merged PR 3163: Extend vector masked loads/stores to handle arbitrary bin ops and constant operands Extend vector masked loads/stores to handle arbitrary bin ops and constant operands --- CMakeLists.txt | 13 +- accera/hat/include/HATEmitter.h | 6 + accera/ir/include/value/ValueOps.td | 10 +- accera/ir/src/AffineConstraintsHelper.cpp | 2 +- accera/ir/src/TranslateToHeader.cpp | 8 - accera/ir/src/value/ValueDialect.cpp | 6 +- accera/python/accera/lang/Array.py | 22 +- accera/python/accera/test/dsl_tests.py | 49 +- accera/python/accera/test/smoke_tests.py | 495 +++++++++++++----- .../include/value/ValueToLLVMLoweringPass.h | 1 + .../value/ValueToStandardLoweringPass.h | 2 +- .../ExecutionPlanToAffineLoweringPass.cpp | 55 +- .../gpu/ConvertLaunchFuncToVulkanCalls.cpp | 4 +- .../src/nest/LoopNestToValueFunc.cpp | 8 +- .../src/util/RangeValueUtilities.cpp | 7 +- .../src/value/ValueFuncToTargetPass.cpp | 3 - .../src/value/ValueToLLVMLoweringPass.cpp | 223 +++++++- .../src/value/ValueToStandardLoweringPass.cpp | 78 ++- .../src/vectorization/VectorizationUtil.cpp | 66 ++- .../utilities/test/src/MemoryLayout_test.cpp | 15 +- accera/value/include/Scalar.h | 2 +- accera/value/include/ScalarDimension.h | 1 + accera/value/src/MLIREmitterContext.cpp | 18 +- accera/value/src/ScalarDimension.cpp | 3 + 24 files changed, 852 insertions(+), 245 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b20f53a..5695293a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,12 +35,12 @@ option(STRICT_MODE "Build with 'warnings as errors'" OFF) option(USE_MKL "Build with Intel MKL" OFF) option(USE_LIBCXX "Build with libc++ if using the Clang compiler" OFF) -if(CMAKE_CXX_COMPILER_ID STREQUAL Clang) +if(CMAKE_CXX_COMPILER_ID STREQUAL Clang OR CMAKE_CXX_COMPILER_ID STREQUAL AppleClang) if(USE_LIBCXX OR (CMAKE_HOST_SYSTEM_NAME STREQUAL Darwin)) add_compile_options(-stdlib=libc++) link_libraries(-lc++ -lc++abi) endif(USE_LIBCXX OR (CMAKE_HOST_SYSTEM_NAME STREQUAL Darwin)) -endif(CMAKE_CXX_COMPILER_ID STREQUAL Clang) +endif(CMAKE_CXX_COMPILER_ID STREQUAL Clang OR CMAKE_CXX_COMPILER_ID STREQUAL AppleClang) # Try to create a compilation database, which is useful to have when working # with clang tooling @@ -161,10 +161,13 @@ else() set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -ggdb3 -O0") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -ggdb3") set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -ggdb3") - if(${CMAKE_CXX_COMPILER_ID} STREQUAL Clang) + + if(CMAKE_CXX_COMPILER_ID STREQUAL Clang OR CMAKE_CXX_COMPILER_ID STREQUAL AppleClang) if(CMAKE_BUILD_TYPE STREQUAL Debug) - # Set options for Control Flow Integrity - add_compile_options(-fsanitize=cfi) + # Set options for Control Flow Integrity + if(NOT ${OSX_NATIVE_ARCH} STREQUAL "arm64") + add_compile_options(-fsanitize=cfi) + endif() endif(CMAKE_BUILD_TYPE STREQUAL Debug) add_compile_options(-Wno-backslash-newline-escape) diff --git a/accera/hat/include/HATEmitter.h b/accera/hat/include/HATEmitter.h index 916ab167..91779905 100644 --- a/accera/hat/include/HATEmitter.h +++ b/accera/hat/include/HATEmitter.h @@ -294,6 +294,8 @@ class Parameter : public 
TOMLSerializable return SerializeCommonParameters(); } + virtual ~Parameter() = default; + protected: Parameter(const LogicalParamType& logicalType, const std::string& name, const std::string& description, const UsageType usage, const std::string& declaredType, const std::string& elementType) : _logicalType{ logicalType }, @@ -415,6 +417,8 @@ class Function : public TOMLSerializable _description{ description }, _callingConvention{ callingConvention } {} + virtual ~Function() = default; + std::string Name() const { return _name; } std::string Description() const { return _description; } @@ -1021,6 +1025,8 @@ class ExternalLibraryReference : public TOMLSerializable return table; } + virtual ~ExternalLibraryReference() = default; + private: std::string _name; std::string _version; diff --git a/accera/ir/include/value/ValueOps.td b/accera/ir/include/value/ValueOps.td index b57a37c2..5cb1b057 100644 --- a/accera/ir/include/value/ValueOps.td +++ b/accera/ir/include/value/ValueOps.td @@ -645,9 +645,13 @@ def accv_CallOp : accv_Op<"call", [CallOpInterface]> { $_state.addOperands(operands); $_state.addAttribute("callee", callee); $_state.addTypes(results); - }]>, OpBuilder<(ins "StringRef":$callee, "ArrayRef":$results, CArg<"ValueRange", "{}">:$operands), [{ - build($_builder, $_state, StringAttr::get($_builder.getContext(), callee), results, - operands); + + // BUGBUG: -Werror,-Winfinite-recursion, needed? + // }]>, OpBuilder<(ins "StringRef":$callee, "ArrayRef":$results, CArg<"ValueRange", "{}">:$operands), [{ + // build($_builder, $_state, StringAttr::get($_builder.getContext(), callee), results, + // operands); + // + }]>]; let extraClassDeclaration = [{ diff --git a/accera/ir/src/AffineConstraintsHelper.cpp b/accera/ir/src/AffineConstraintsHelper.cpp index 3f5c1eca..a0153ef6 100644 --- a/accera/ir/src/AffineConstraintsHelper.cpp +++ b/accera/ir/src/AffineConstraintsHelper.cpp @@ -200,7 +200,7 @@ namespace util if (_cst.containsId(val)) { unsigned id = 0; - bool found = _cst.findId(val, &id); + [[maybe_unused]] bool found = _cst.findId(val, &id); assert(found); return IdWrapper::FromFullId(id, _cst); } diff --git a/accera/ir/src/TranslateToHeader.cpp b/accera/ir/src/TranslateToHeader.cpp index 63df849c..9b598f90 100644 --- a/accera/ir/src/TranslateToHeader.cpp +++ b/accera/ir/src/TranslateToHeader.cpp @@ -750,14 +750,6 @@ namespace ir } std::vector shapeVec; std::transform(shape.begin(), shape.end(), std::back_inserter(shapeVec), [](int64_t val) { return static_cast(val); }); - if (usage != hat::UsageType::Input && shapeVec.size() == 1 && shapeVec[0] == 1) - { - // TODO: This is currently a hack since output Dimension does not work. So in the DSL we use Array - // instead and here we emulate an ElementParameter instead. Remove this when output Dimension are working. 
- assert(declaredType.back() == '*'); - return std::make_unique(name, description, usage, declaredType.substr(0, declaredType.length() - 1), elementType); - } - return std::make_unique(name, description, usage, declaredType, elementType, shapeVec, affineMap, affineOffset); } diff --git a/accera/ir/src/value/ValueDialect.cpp b/accera/ir/src/value/ValueDialect.cpp index 2abd8a22..fc0f8215 100644 --- a/accera/ir/src/value/ValueDialect.cpp +++ b/accera/ir/src/value/ValueDialect.cpp @@ -511,9 +511,9 @@ MemRefType ViewOp::computeMemRefType(Value source, ValueRange sizes, ValueRange auto context = source.getContext(); auto sourceMemRefType = source.getType().cast(); int64_t sourceRank = sourceMemRefType.getRank(); - int64_t numOffsets = static_cast(offsets.size()); - int64_t numSizes = static_cast(sizes.size()); - int64_t numStrides = static_cast(strides.size()); + [[maybe_unused]] int64_t numOffsets = static_cast(offsets.size()); + [[maybe_unused]] int64_t numSizes = static_cast(sizes.size()); + [[maybe_unused]] int64_t numStrides = static_cast(strides.size()); assert(sourceRank == numOffsets); assert(sourceRank == numSizes); assert(sourceRank == numStrides); diff --git a/accera/python/accera/lang/Array.py b/accera/python/accera/lang/Array.py index cd076637..80bdb94b 100644 --- a/accera/python/accera/lang/Array.py +++ b/accera/python/accera/lang/Array.py @@ -11,7 +11,7 @@ from operator import mul from typing import * -from .._lang_python import ScalarType, _MemoryLayout, AllocateFlags, Role +from .._lang_python import ScalarType, _MemoryLayout, AllocateFlags, Role, type_size_bytes from .._lang_python._lang import Array as NativeArray, Dimension from .Layout import Layout, MemoryMapLayout from ..Parameter import DelayedParameter @@ -177,12 +177,15 @@ def _value(self): return None def _reinterpret_cast_internal(self, element_type): - if any(map(lambda d: isinstance(d, Dimension), self.shape)): - expected_layout = [-1] + src_element_size = type_size_bytes(self.element_type) + dst_element_size = type_size_bytes(element_type) + if src_element_size == dst_element_size: + expected_layout = self.shape else: - src_element_size = np.dtype(SCALAR_TYPE_TO_DTYPE_STR[self.element_type]).itemsize - dst_element_size = np.dtype(SCALAR_TYPE_TO_DTYPE_STR[element_type]).itemsize - expected_layout = [int(self._num_elements * (src_element_size / dst_element_size))] + if any(map(lambda d: isinstance(d, Dimension), self.shape)): + expected_layout = [-1] + else: + expected_layout = [int(self._num_elements * (src_element_size / dst_element_size))] reinterpreted = Array(role=self.role, element_type=element_type, shape=expected_layout) return reinterpreted @@ -190,9 +193,12 @@ def _get_memory_buffer(self): return self._reinterpret_cast_internal(ScalarType.uint8) def _reinterpret_cast(self, element_type): - if self.element_type != ScalarType.uint8 or len(self.shape) != 1: + src_bytewidth = type_size_bytes(self.element_type) + dst_bytewidth = type_size_bytes(element_type) + if src_bytewidth != dst_bytewidth and \ + (self.element_type != ScalarType.uint8 or len(self.shape) != 1): raise RuntimeError( - "Can only call reinterpret cast on flat uint8 memory buffers. Call _get_memory_buffer first?" + "Can only call reinterpret cast such that the bitwidth doesn't change, or on flat uint8 memory buffers. Call _get_memory_buffer first?" 
) return self._reinterpret_cast_internal(element_type) diff --git a/accera/python/accera/test/dsl_tests.py b/accera/python/accera/test/dsl_tests.py index c632fb1c..11f4462b 100644 --- a/accera/python/accera/test/dsl_tests.py +++ b/accera/python/accera/test/dsl_tests.py @@ -640,6 +640,22 @@ def test_reinterpret_cast(self) -> None: shape=(256, 256), ) + def reinterpret_arr_as_int16(array: Array): + # Assumes array is f32 + + num_elements = reduce(lambda x, y: x*y, array.shape, 1) + arr_mb = array._get_memory_buffer() + self.assertEqual(arr_mb.shape, [num_elements * 4]) + self.assertEqual(arr_mb.element_type, ScalarType.uint8) + print(arr_mb.layout) + + arr_as_int16 = arr_mb._reinterpret_cast(ScalarType.int16) + self.assertEqual(arr_as_int16.shape, [num_elements * 2]) + self.assertEqual(arr_as_int16.element_type, ScalarType.int16) + print(arr_as_int16.layout) + + return arr_as_int16 + def reinterpret_arr_as_int32(array: Array): # Assumes array is f32 @@ -656,9 +672,19 @@ def reinterpret_arr_as_int32(array: Array): return arr_as_int32 + def simple_reinterpret_arr_as_int32(array: Array): + # Assumes array is f32 + + num_elements = reduce(lambda x, y: x*y, array.shape, 1) + arr_as_int32 = array._reinterpret_cast(ScalarType.int32) + self.assertEqual(arr_as_int32.shape, array.shape) + self.assertEqual(arr_as_int32.element_type, ScalarType.int32) + print(arr_as_int32.layout) + + return arr_as_int32 + # add a function that utilizes a subarray layout - def make_reinterpreted_fn(array): - reinterpreted = reinterpret_arr_as_int32(array) + def make_reinterpreted_fn(array, reinterpreted): nest = Nest(shape=reinterpreted.shape) i = nest.get_indices() @@ -668,12 +694,23 @@ def _(): return package.add(nest, args=(reinterpreted, )) - reinterpreted_fn = make_reinterpreted_fn(arr) + reinterpreted_i32 = reinterpret_arr_as_int32(arr) + reinterpreted_i32_fn = make_reinterpreted_fn(arr, reinterpreted_i32) + + reinterpreted_i16 = reinterpret_arr_as_int16(arr) + reinterpreted_i16_fn = make_reinterpreted_fn(arr, reinterpreted_i16) + + simple_reinterpreted_i32 = simple_reinterpret_arr_as_int32(arr) + simple_reinterpreted_i32_fn = make_reinterpreted_fn(arr, simple_reinterpreted_i32) # add a function that instantiates a subarray of the input array and calls the function above def main(array): - reinterpreted_array = reinterpret_arr_as_int32(array) - reinterpreted_fn(reinterpreted_array) + reinterpreted_array_i32 = reinterpret_arr_as_int32(array) + reinterpreted_array_i16 = reinterpret_arr_as_int16(array) + simple_reinterpreted_array_i32 = simple_reinterpret_arr_as_int32(array) + reinterpreted_i32_fn(reinterpreted_array_i32) + reinterpreted_i16_fn(reinterpreted_array_i16) + simple_reinterpreted_i32_fn(simple_reinterpreted_array_i32) package.add(main, args=(arr, )) @@ -1432,7 +1469,7 @@ def _(): } # TODO: Disabling this verification for now, re-enable it when undoing this change. 
- # self._verify_helper(package, get_size_fn_name, get_size_fn.name, correctness_check_values) + self._verify_helper(package, get_size_fn_name, get_size_fn.name, correctness_check_values) correctness_check_values = { "pre": [size_test, x_ref, start_array_pre_test, delta_test], diff --git a/accera/python/accera/test/smoke_tests.py b/accera/python/accera/test/smoke_tests.py index 5b312e05..d0499b71 100644 --- a/accera/python/accera/test/smoke_tests.py +++ b/accera/python/accera/test/smoke_tests.py @@ -5,17 +5,18 @@ #################################################################################################### import inspect -from itertools import product -import os -import sys -import unittest import logging +import os import pathlib import platform import shutil -import numpy as np +import sys +import unittest +from itertools import product from typing import Callable, List +import numpy as np + try: import cuda except: @@ -48,13 +49,21 @@ "public": False } -from accera import Package, ScalarType, Nest, Array, Constants, Scalar, fuse, create_parameters, cast, Target, Role -from accera._lang_python._lang import _MemorySpace, _MMAShape, Dimension -from accera import min as accmin +from accera._lang_python import _MemoryLayout +from accera._lang_python._lang import Array as NativeArray +from accera._lang_python._lang import Dimension, _MemorySpace, _MMAShape, _If +from accera._lang_python._lang._gpu import Barrier from accera.samples import MatrixMultiplication -from accera.test import verifiers -from accera.test.test_utils import expectedFailure, FailedReason from accera.Targets import KNOWN_DEVICES +from accera.test import verifiers +from accera.test.test_utils import FailedReason, expectedFailure + +from accera import ( + AUTO, AllocateFlags, Array, Constants, Nest, Package, Role, Scalar, ScalarType, Target, cast, create_dimensions, + create_parameters, fuse +) +from accera import min as accmin +from accera import abs as accabs TEST_PACKAGE_DIR = "test_acccgen" @@ -500,6 +509,7 @@ def _(): def test_mlas_matmul(self) -> None: from itertools import combinations_with_replacement + from accera.samples.MatrixMultiplication import MLAS domains = combinations_with_replacement([1, 31, 63, 127], 3) @@ -616,7 +626,7 @@ def _(): v.check_correctness(function.name, before=(In_test, Out_test), after=(In_test, Out_ref)) def _test_fast_exp_mlas(self, func_level_precision: bool): - from accera import fast_exp_mlas, fast_exp + from accera import fast_exp, fast_exp_mlas M = 64 N = 64 @@ -647,9 +657,9 @@ def _(): pkg_opt = Package._Options.NONE if func_level_precision else Package._Options.HIGH_PRECISION_FLOATING_POINT_OPS # Create a package and add our function definition to it - package_name = "test_fast_exp_mlas" + package_name = f"test_fast_exp_mlas_{'func' if func_level_precision else 'pkg'}" package = Package() - function = package.add(plan, args=(In, Out), base_name="test_fast_exp_mlas", function_opts=func_opt) + function = package.add(plan, args=(In, Out), base_name=package_name, function_opts=func_opt) # Build the HAT package with verifiers.VerifyPackage(self, package_name, TEST_PACKAGE_DIR) as v: @@ -679,7 +689,8 @@ def test_fast_exp_mlas_w_pkg_level_precision(self): self._test_fast_exp_mlas(False) def test_emittime_cache_mlas_matmul(self) -> None: - from accera.samples.OfflineCacheMatrixMultiplication import EmitTimeCacheMLAS + from accera.samples.OfflineCacheMatrixMultiplication import \ + EmitTimeCacheMLAS package = Package() M, N, K = [31, 63, 127] @@ -705,7 +716,8 @@ def 
test_emittime_cache_mlas_matmul(self) -> None: v.check_correctness(function.name, before=(A_test, B_test, C_test), after=(A_test, B_test, C_ref)) def test_runtime_init_cache_mlas_matmul(self) -> None: - from accera.samples.OfflineCacheMatrixMultiplication import RuntimeInitCacheMLAS + from accera.samples.OfflineCacheMatrixMultiplication import \ + RuntimeInitCacheMLAS package = Package() @@ -778,8 +790,6 @@ def _(): def _make_vulkan_gpu_matmul_plan(self, M, N, K): import math - from accera import Target - from accera._lang_python._lang import _If, as_index def get_clamped_block_dimensions(M, N, base_block_dim_M=16, base_block_dim_N=16): return min(M, base_block_dim_M), min(N, base_block_dim_N) @@ -883,8 +893,6 @@ def test_two_vulkan_gpu_matmul(self) -> None: @expectedFailure(FailedReason.NOT_IN_CORE, "function that contains multiple nests") def test_int8_matmul(self) -> None: - from accera import cast - # Define our matrix sizes M = 128 N = 256 @@ -1588,7 +1596,6 @@ def _fn(): def test_dynamic_sub_array_split_dim_subfunction(self) -> None: # This is a contrived way to simply copy an array, but the utilities used are for packing partial higher dimensional arrays - from accera import create_dimensions test_name = "test_dynamic_sub_array_split_dim_subfunction" N = 64 @@ -1650,10 +1657,173 @@ def _outer_fn(): test_output_ref = test_input.copy() v.check_correctness(function.name, before=(test_input, test_output), after=(test_input, test_output_ref)) + def test_dim_bool_operation(self): + N = create_dimensions() + In = Array(Role.INPUT, element_type=ScalarType.float32, shape=(N, )) + Out = Array(Role.INPUT_OUTPUT, element_type=ScalarType.int64, shape=(1, )) + + nest = Nest((1, )) + + @nest.iteration_logic + def _(): + + def T(): + Out[0] = 1 + + def F(): + Out[0] = 0 + + _If(N > 10, T).Else(F) + + name = "test_dim_bool_operation" + package = Package() + function = package.add(nest, args=(N, In, Out), base_name=name) + + In_test = np.random.rand(12).astype(np.float32) + Out_test = np.array([-1], dtype=np.int64) + Out_ref = np.array([1], dtype=np.int64) + + with verifiers.VerifyPackage(self, name, TEST_PACKAGE_DIR) as v: + package.build( + name, + format=self.PACKAGE_FORMAT, + mode=self.PACKAGE_MODE, + output_dir=TEST_PACKAGE_DIR, + ) + + v.check_correctness(function.name, before=(In_test, Out_test), after=(In_test, Out_ref)) + + @expectedFailure(FailedReason.BUG, "unknown reason") + def test_dim_cast_bool_operation(self): + N = create_dimensions() + In = Array(Role.INPUT, element_type=ScalarType.float32, shape=(N, )) + Out = Array(Role.INPUT_OUTPUT, element_type=ScalarType.int64, shape=(1, )) + + nest = Nest((1, )) + + @nest.iteration_logic + def _(): + Out[0] = cast(N > 10, ScalarType.int64) + + name = "test_dim_cast_bool_operation" + package = Package() + function = package.add(nest, args=(N, In, Out), base_name=name) + + In_test = np.random.rand(12).astype(np.float32) + Out_test = np.array([-1], dtype=np.int64) + Out_ref = np.array([1], dtype=np.int64) + + with verifiers.VerifyPackage(self, name, TEST_PACKAGE_DIR) as v: + package.build( + name, + format=self.PACKAGE_FORMAT, + mode=self.PACKAGE_MODE, + output_dir=TEST_PACKAGE_DIR, + ) + + v.check_correctness(function.name, before=(In_test, Out_test), after=(In_test, Out_ref)) + + def test_dim_arithmetic_operation_1(self): + N = create_dimensions() + In = Array(Role.INPUT, element_type=ScalarType.float32, shape=(N, )) + Out = Array(Role.INPUT_OUTPUT, element_type=ScalarType.int64, shape=(1, )) + + nest = Nest((1, )) + + @nest.iteration_logic + 
def _(): + Out[0] = cast(N, ScalarType.int64) / 3 + + name = "test_dim_arithmetic_operation_1" + package = Package() + function = package.add(nest, args=(N, In, Out), base_name=name) + + In_test = np.random.rand(12).astype(np.float32) + Out_test = np.array([-1], dtype=np.int64) + Out_ref = np.array([12 // 3], dtype=np.int64) + + with verifiers.VerifyPackage(self, name, TEST_PACKAGE_DIR) as v: + package.build( + name, + format=self.PACKAGE_FORMAT, + mode=self.PACKAGE_MODE, + output_dir=TEST_PACKAGE_DIR, + ) + + v.check_correctness(function.name, before=(In_test, Out_test), after=(In_test, Out_ref)) + + def test_dim_arithmetic_operation_2(self): + N1, N2 = create_dimensions() + In = Array(Role.INPUT, element_type=ScalarType.float32, shape=(N1, N2)) + Out = Array(Role.INPUT_OUTPUT, element_type=ScalarType.int64, shape=(1, )) + + nest = Nest((1, )) + + @nest.iteration_logic + def _(): + # is the difference between the dimensions odd + diff = cast(N1, ScalarType.int64) - cast(N2, ScalarType.int64) + is_odd = diff % 2 + Out[0] = is_odd + + name = "test_dim_arithmetic_operation_2" + package = Package() + function = package.add(nest, args=(N1, N2, In, Out), base_name=name) + + In_test = np.random.rand(12, 8).astype(np.float32) + Out_test = np.array([-1], dtype=np.int64) + Out_ref = np.array([(12 - 8) % 2], dtype=np.int64) + + with verifiers.VerifyPackage(self, name, TEST_PACKAGE_DIR) as v: + package.build( + name, + format=self.PACKAGE_FORMAT, + mode=self.PACKAGE_MODE, + output_dir=TEST_PACKAGE_DIR, + ) + + v.check_correctness(function.name, before=(In_test, Out_test), after=(In_test, Out_ref)) + + def test_dim_arithmetic_operation_then_bool_op(self): + N = create_dimensions() + In = Array(Role.INPUT, element_type=ScalarType.float32, shape=(N, )) + Out = Array(Role.INPUT_OUTPUT, element_type=ScalarType.int64, shape=(1, )) + + nest = Nest((1, )) + + @nest.iteration_logic + def _(): + mod2 = cast(N, ScalarType.int64) % 2 + + def T(): + Out[0] = 1 + + def F(): + Out[0] = 0 + + _If(mod2 == 0, T).Else(F) + + name = "test_dim_arithmetic_operation_then_bool_op" + package = Package() + function = package.add(nest, args=(N, In, Out), base_name=name) + + In_test = np.random.rand(12).astype(np.float32) + Out_test = np.array([-1], dtype=np.int64) + Out_ref = np.array([int(12 % 2 == 0)], dtype=np.int64) + + with verifiers.VerifyPackage(self, name, TEST_PACKAGE_DIR) as v: + package.build( + name, + format=self.PACKAGE_FORMAT, + mode=self.PACKAGE_MODE, + output_dir=TEST_PACKAGE_DIR, + ) + + v.check_correctness(function.name, before=(In_test, Out_test), after=(In_test, Out_ref)) + def test_dynamic_sub_array_multi_split_dim_subfunction(self) -> None: # Copy and pack a buffer into 2x4 tiles # Split the flat buffer into a 4-D buffer, where it has a truncated shape in the outer loop's cleanup loop - from accera import create_dimensions test_name = "test_dynamic_sub_array_multi_split_dim_subfunction" @@ -1816,16 +1986,17 @@ def packed_index(i_outer, i_middle, i_inner, j_outer, j_middle, j_inner, tile_of )] = test_input[i_outer + i_middle + i_inner, j_outer + j_middle + j_inner] v.check_correctness(function.name, before=(test_input, test_output), after=(test_input, test_output_ref)) - @expectedFailure(FailedReason.BUG, "_split_dimension of a dynamically sized dimension with a dynamic size is not working") + @expectedFailure( + FailedReason.BUG, "_split_dimension of a dynamically sized dimension with a dynamic size is not working" + ) def test_dynamic_split_dim_dynamic_size(self) -> None: - from accera import 
create_dimensions test_name = "test_dynamic_split_dim_dynamic_size" M, N, MN = create_dimensions() package = Package() - Input = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(MN,)) + Input = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(MN, )) Output = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(M, N)) nest = Nest(shape=(M, N)) @@ -1836,11 +2007,7 @@ def _(): split_input = Input._split_dimension(0, N) Output[i, j] = split_input[i, j] - fn = package.add( - nest, - args=(MN, M, N, Input, Output), - base_name=f"{test_name}_fn" - ) + fn = package.add(nest, args=(MN, M, N, Input, Output), base_name=f"{test_name}_fn") output_dir = pathlib.Path(TEST_PACKAGE_DIR) / test_name shutil.rmtree(output_dir, ignore_errors=True) @@ -1851,17 +2018,19 @@ def _(): ) # correctness check - test_M = 64 - test_N = 16 - test_MN = test_M*test_N - test_input = np.random.random([test_M*test_N]).astype(np.float32) + test_M = np.int64(64) + test_N = np.int64(16) + test_MN = np.int64(test_M * test_N) + test_input = np.random.random([test_M * test_N]).astype(np.float32) test_output = np.random.random([test_M, test_N]).astype(np.float32) test_output_ref = test_input.copy().reshape((test_M, test_N)) - v.check_correctness(function.name, before=(test_MN, test_M, test_N, test_input, test_output), after=(test_MN, test_M, test_N, test_input, test_output_ref)) + v.check_correctness( + fn.name, + before=(test_MN, test_M, test_N, test_input, test_output), + after=(test_MN, test_M, test_N, test_input, test_output_ref) + ) - @expectedFailure(FailedReason.BUG, "_split_dimension of a dynamically sized dimension with a static size is not working") def test_dynamic_split_dim_static_size(self) -> None: - from accera import create_dimensions test_name = "test_dynamic_split_dim_static_size" M, MN = create_dimensions() @@ -1869,6 +2038,51 @@ def test_dynamic_split_dim_static_size(self) -> None: package = Package() + Input = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(MN, )) + Output = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(M, N)) + + nest = Nest(shape=(M, N)) + i, j = nest.get_indices() + + @nest.iteration_logic + def _(): + split_input = Input._split_dimension(0, cast(16, ScalarType.index)) + Output[i, j] = split_input[i, j] + + fn = package.add(nest, args=(MN, M, Input, Output), base_name=f"{test_name}_fn") + + output_dir = pathlib.Path(TEST_PACKAGE_DIR) / test_name + shutil.rmtree(output_dir, ignore_errors=True) + + with verifiers.VerifyPackage(self, test_name, output_dir) as v: + package.build( + name=test_name, format=self.PACKAGE_FORMAT, mode=self.PACKAGE_MODE, output_dir=output_dir, _quiet=False + ) + + # correctness check + test_M = np.int64(64) + test_N = N + test_MN = np.int64(test_M * test_N) + test_input = np.random.random([test_M * test_N]).astype(np.float32) + test_output = np.random.random([test_M, test_N]).astype(np.float32) + test_output_ref = test_input.copy().reshape((test_M, test_N)) + v.check_correctness( + fn.name, + before=(test_MN, test_M, test_input, test_output), + after=(test_MN, test_M, test_input, test_output_ref) + ) + + # This test uses all static sizes to make sure the fix for dynamic size (test_dynamic_split_dim_static_size) + # won't regress the static size case. 
+ def test_dynamic_split_dim_all_static(self) -> None: + test_name = "test_dynamic_split_dim_all_static" + + M = 8 + MN = 128 + N = 16 + + package = Package() + Input = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(MN,)) Output = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(M, N)) @@ -1882,7 +2096,7 @@ def _(): fn = package.add( nest, - args=(MN, M, Input, Output), + args=(Input, Output), base_name=f"{test_name}_fn" ) @@ -1895,13 +2109,14 @@ def _(): ) # correctness check - test_M = 64 + test_M = 8 test_N = N test_MN = test_M*test_N test_input = np.random.random([test_M*test_N]).astype(np.float32) test_output = np.random.random([test_M, test_N]).astype(np.float32) test_output_ref = test_input.copy().reshape((test_M, test_N)) - v.check_correctness(function.name, before=(test_MN, test_M, test_input, test_output), after=(test_MN, test_M, test_input, test_output_ref)) + v.check_correctness(fn.name, before=(test_input, test_output), after=(test_input, test_output_ref)) + def test_padded_nchwc_conv2d_manual_cache(self) -> None: input_channels = 64 @@ -2066,7 +2281,6 @@ def _(): package.build(name=package_name, format=self.PACKAGE_FORMAT, mode=self.PACKAGE_MODE, output_dir=output_dir) def test_cross_compile(self) -> None: - from accera import Target M = 128 N = 256 K = 256 @@ -2891,8 +3105,6 @@ def test_boundary_differently_shaped_budget_cache(self) -> None: self._verify_matrix_multiplication_function(function, package, f"test_boundary_differently_shaped_budget_cache") def test_gpu_vec_add(self): - from accera import Array, Nest, Package, ScalarType, Target - # Define our vector sizes N = 2**16 block_x = 256 @@ -2941,8 +3153,6 @@ def _(): v.check_correctness(function.name, before=before, after=after) def _test_gpu_vec_add_boundary(self, N, splits, test_name): - from accera import Array, Nest, Package, ScalarType, Target - A = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N, )) B = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N, )) C = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(N, )) @@ -3003,8 +3213,6 @@ def _(): v.check_correctness(function.name, before=before, after=after) def _test_cpu_vec_add_boundary(self, N, splits, test_name): - from accera import Array, Nest, Package, ScalarType, Target - A = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N, )) B = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N, )) C = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(N, )) @@ -3063,9 +3271,6 @@ def test_gpu_vec_add_gpu_boundary_2_splits_cpuonly(self): self._test_cpu_vec_add_boundary(1280, [512, 64], inspect.currentframe().f_code.co_name) def _add_cuda_copy_kernel(self, package, N, block_x, block_y, target, basename="cuda_copy_kernel"): - from accera import Array, Nest, ScalarType - from accera._lang_python._lang import _MemorySpace - In = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N, N)) Out = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(N, N)) @@ -3098,8 +3303,6 @@ def _(): return function def test_cuda_module_output(self) -> None: - from accera import Package, Target - N = 2048 block_x = 16 block_y = block_x @@ -3128,8 +3331,6 @@ def test_cuda_module_output(self) -> None: v.check_correctness(function.name, before=(Input_test, Output_test), after=(Input_ref, Output_ref)) def test_cuda_multiple_funcs(self) -> None: - from accera import Package, Target - Ns = [1024, 2048] block_x = 16 block_y = block_x @@ 
-3161,9 +3362,6 @@ def test_cuda_multiple_funcs(self) -> None: v.check_correctness(function.name, before=(Input_test, Output_test), after=(Input_ref, Output_ref)) def _add_rocm_copy_kernel(self, package, N, block_x, block_y, target, basename="rocm_copy_kernel"): - from accera import Array, Nest, ScalarType - from accera._lang_python._lang import _MemorySpace - In = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N, N)) Out = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(N, N)) @@ -3197,8 +3395,6 @@ def _(): return function def test_rocm_module_output(self) -> None: - from accera import Package, Target - # Define our vector sizes N = 32 block_x = 16 @@ -3228,8 +3424,6 @@ def test_rocm_module_output(self) -> None: v.check_correctness(function.name, before=before, after=after) def test_rocm_multiple_funcs(self) -> None: - from accera import Package, Target - Ns = [1024, 2048] block_x = 16 block_y = block_x @@ -3272,8 +3466,6 @@ def _gpu_cache( double_buffer=False, double_buffer_location=Constants.AUTO ) -> None: - from accera import Array, Nest, Package, ScalarType, Target - A = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(M, K), layout=Array.Layout.FIRST_MAJOR) B = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(K, N), layout=Array.Layout.FIRST_MAJOR) C = Array( @@ -3343,9 +3535,6 @@ def test_gpu_cache_double_buffering(self) -> None: self._gpu_cache(2560, 1536, 2048, 16, 16, 32, "test_gpu_cache_double_buffering", True) def test_gpu_cache_double_buffering_trigger_index(self) -> None: - from accera import Array, Nest, Package, ScalarType, Target - from accera._lang_python._lang import _MemorySpace - M = 2560 N = 1536 K = 2048 @@ -3426,8 +3615,6 @@ def test_gpu_cache_double_buffering_mem_space(self) -> None: ) def test_cpu_cache_double_buffering_trigger_index(self) -> None: - from accera import Array, Nest, Package, ScalarType - M = 1024 N = 1024 K = 1024 @@ -3511,7 +3698,6 @@ def _(): # TODO : move vpmaddwd tests to a different test file def test_signextend_int16_matmul_vpmaddwd(self): - from accera import AllocateFlags, create_dimensions test_name = "test_signextend_int16_matmul_vpmaddwd" def inout_array(arr: Array): @@ -4251,10 +4437,10 @@ def test_f32_horizontal_vector_add_1_row(self): test_name = "test_f32_horizontal_vector_add_1_row" N = 8 - A = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N,), layout=Array.Layout.FIRST_MAJOR) + A = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N, ), layout=Array.Layout.FIRST_MAJOR) B = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(1, ), layout=Array.Layout.FIRST_MAJOR) - nest = Nest(shape=(N,)) + nest = Nest(shape=(N, )) i, = nest.get_indices() @nest.iteration_logic @@ -4268,8 +4454,8 @@ def _(): package = Package() function = package.add(plan, args=(A, B), base_name=test_name) - A_test = np.random.random((N,)).astype(np.float32) - B_test = np.random.random((1,)).astype(np.float32) + A_test = np.random.random((N, )).astype(np.float32) + B_test = np.random.random((1, )).astype(np.float32) B_ref = B_test.copy() for i in range(N): @@ -4300,10 +4486,10 @@ def test_i32_horizontal_vector_add_1_row(self): test_name = "test_i32_horizontal_vector_add_1_row" N = 8 - A = Array(role=Role.INPUT, element_type=ScalarType.int32, shape=(N,), layout=Array.Layout.FIRST_MAJOR) + A = Array(role=Role.INPUT, element_type=ScalarType.int32, shape=(N, ), layout=Array.Layout.FIRST_MAJOR) B = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.int32, 
shape=(1, ), layout=Array.Layout.FIRST_MAJOR) - nest = Nest(shape=(N,)) + nest = Nest(shape=(N, )) i, = nest.get_indices() @nest.iteration_logic @@ -4317,8 +4503,8 @@ def _(): package = Package() function = package.add(plan, args=(A, B), base_name=test_name) - A_test = np.random.random((N,)).astype(np.int32) - B_test = np.random.random((1,)).astype(np.int32) + A_test = np.random.random((N, )).astype(np.int32) + B_test = np.random.random((1, )).astype(np.int32) B_ref = B_test.copy() for i in range(N): @@ -4612,11 +4798,6 @@ def test_matmul_input_cache_element_type_uint_to_int(self) -> None: ) def test_gpu_barrier_opt(self) -> None: - from accera import Array, Nest, Package, ScalarType, Target - from accera._lang_python._lang import Allocate, _MemorySpace, Array as NativeArray - from accera._lang_python._lang._gpu import Barrier - from accera._lang_python import _MemoryLayout - N = 256 block_x = 16 @@ -4682,8 +4863,6 @@ def _(): v.check_correctness(function.name, before=before, after=after) def test_rocm_gemm_tiled_output(self) -> None: - from accera import Array, Nest, Package, ScalarType, Target - M = 16 N = M K = M @@ -5448,8 +5627,6 @@ def run_file_check(verifier): # import accera as acc # # TODO : update once MemorySpace is better surfaced - # from accera._lang_python._lang import _MemorySpace - # package = Package() # M = 32 @@ -5502,8 +5679,6 @@ def run_file_check(verifier): # import accera as acc # # TODO : update once MemorySpace is better surfaced - # from accera._lang_python._lang import _MemorySpace - # package = Package() # M = 32 @@ -5735,8 +5910,6 @@ def run_file_check(verifier): # self._verify_matrix_multiplication_function(function, package, f"test_thrifty_caching_elide_boundary_no_elide_main") def test_gpu_cache_different_input_layouts(self): - from accera import Array, Nest, Package, ScalarType, Target - from accera._lang_python._lang import _MemorySpace M = 2560 N = 1536 @@ -5858,9 +6031,6 @@ def test_gpu_cache_block_level_private_mem(self): # This test verifies that a private memory cache will compute a region specific to each thread # even when added at the block level of the loopnest - from accera import Array, Nest, Package, ScalarType, Target - from accera._lang_python._lang import _MemorySpace - M = 2560 N = 1536 K = 2048 @@ -5943,9 +6113,6 @@ def test_gpu_cache_block_level_shared_mem(self): # This test verifies that a shared memory cache will compute a region specific to each logical block # even when added outside the block level of the loopnest - from accera import Array, Nest, Package, ScalarType, Target - from accera._lang_python._lang import _MemorySpace - M = 2560 N = 1536 K = 2048 @@ -6030,9 +6197,6 @@ def test_gpu_cache_block_level_global_mem(self): # This test verifies that a global memory cache will compute a region specific to each logical block # even when added outside the block level of the loopnest - from accera import Array, Nest, Package, ScalarType, Target - from accera._lang_python._lang import _MemorySpace - M = 2560 N = 1536 K = 2048 @@ -6114,7 +6278,6 @@ def file_check_fn(verifier): ) def test_vectorized_and_unvectorized_cpu_caches(self): - from accera import AUTO M = 512 N = 512 S = 512 @@ -6165,8 +6328,6 @@ def _(): self._verify_matrix_multiplication_function(function, package, f"test_vectorized_and_unvectorized_cpu_caches") def test_rocm_cache_double_buffering__with_c_cache_tensorize(self) -> None: - from accera import Array, Nest, Package, ScalarType, Target - M = 1024 N = 1024 K = 1024 @@ -6247,8 +6408,6 @@ def _(): ) def 
test_rocm_c_cache_private(self) -> None: - from accera import Array, Nest, Package, ScalarType, Target - M = 1024 N = 1024 K = 1024 @@ -6311,9 +6470,6 @@ def _(): ) def test_fill_fp16(self): - from accera import Array, Nest, Package, ScalarType - from accera import cast - # Define our vector sizes N = 2**16 @@ -6348,9 +6504,6 @@ def fill_fp16(): v.check_correctness(function.name, before=(Output_test, ), after=(Output_ref, )) def test_abs_fp16(self): - from accera import Array, Nest, Package, ScalarType, Target - from accera import abs - # Define our vector sizes N = 16 @@ -6362,7 +6515,7 @@ def test_abs_fp16(self): @nest.iteration_logic def _(): - Out[i] = abs(In[i]) + Out[i] = accabs(In[i]) schedule = nest.create_schedule() plan = schedule.create_plan() @@ -6386,8 +6539,6 @@ def abs_fp16(a): v.check_correctness(function.name, before=(Input_test, Output_test), after=(Input_test, Output_ref)) def test_vec_add_fp16(self): - from accera import Array, Nest, Package, ScalarType - # Define our vector sizes N = 2**16 @@ -6430,8 +6581,6 @@ def vecadd_ref(a, b): ) def test_rocm_tensorize_fp16(self) -> None: - from accera import Array, Nest, Package, ScalarType, Target - M = 1024 N = 1024 K = 1024 @@ -6494,8 +6643,6 @@ def _(): ) def test_rocm_cache_double_buffering_tensorize_fp16(self) -> None: - from accera import Array, Nest, Package, ScalarType, Target - M = 1024 N = 1024 K = 1024 @@ -6575,8 +6722,6 @@ def _(): ) def test_rocm_double_buffer_small_cache_vectorized_unvectorized_tensorized(self) -> None: - from accera import Array, Nest, Package, ScalarType, Target - M = 512 N = 512 K = 512 @@ -6907,7 +7052,6 @@ def _(): ) def test_vectorized_masked_buffer_fill(self) -> None: - from accera._lang_python._lang import _If N_input = 5 N_output = 8 Input = Array(role=Role.INPUT, element_type=ScalarType.int32, shape=(N_input, )) @@ -6943,7 +7087,6 @@ def store_zero(): ) def test_vectorized_masked_store(self) -> None: - from accera._lang_python._lang import _If N_input = 8 N_output = 5 Input = Array(role=Role.INPUT, element_type=ScalarType.int32, shape=(N_input, )) @@ -6976,7 +7119,6 @@ def store_value(): ) def test_vectorized_masked_accumulate(self) -> None: - from accera._lang_python._lang import _If N_input = 8 N_output = 5 Input = Array(role=Role.INPUT, element_type=ScalarType.int32, shape=(N_input, )) @@ -7008,6 +7150,40 @@ def store_value(): output_dir=output_dir ) + def test_vectorized_masked_constant_scale_store(self) -> None: + from accera._lang_python._lang import _If + package_name = "test_vectorized_masked_constant_scale_store" + N_input = 8 + N_output = 5 + scale = 0.2 + Input = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N_input, )) + Output = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(N_output, )) + package = Package() + nest = Nest(shape=(N_input, )) + i, = nest.get_indices() + + @nest.iteration_logic + def _nest(): + + def store_value(): + Output[i] = Input[i] * scale + + _If(i < N_output, store_value) + + sched = nest.create_schedule() + plan = sched.create_plan() + plan.vectorize(i) + fn = package.add(plan, args=(Input, Output), base_name=package_name) + output_dir = pathlib.Path(TEST_PACKAGE_DIR) / package_name + shutil.rmtree(output_dir, ignore_errors=True) + with verifiers.VerifyPackage(self, package_name, output_dir) as v: + package.build( + name=package_name, + format=self.PACKAGE_FORMAT | Package.Format.MLIR_VERBOSE, + mode=self.PACKAGE_MODE, + output_dir=output_dir + ) + def test_packing_floordiv_mod_no_splits(self) -> None: 
package_name = "test_packing_floordiv_mod_no_splits" M = 256 @@ -7107,6 +7283,83 @@ def _nest(): output_dir=output_dir ) + def test_dynamic_temp_array(self) -> None: + import accera as acc + test_name = "test_dynamic_temp_array" + + package = Package() + + M, N = acc.create_dimensions() + + A = acc.Array(role=acc.Role.INPUT, shape=(M, N), layout=acc.Array.Layout.FIRST_MAJOR) + B = acc.Array(role=acc.Role.TEMP, shape=(M, N), layout=acc.Array.Layout.FIRST_MAJOR) + C = acc.Array(role=acc.Role.INPUT_OUTPUT, shape=(N, M), layout=acc.Array.Layout.FIRST_MAJOR) + + nest = acc.Nest(shape=(M, N)) + i, j = nest.get_indices() + + @nest.iteration_logic + def _(): + B[i, j] = A[i, j] + C[j, i] = B[i, j] + + function = package.add(nest, args=(M, N, A, C), base_name=test_name) + + output_dir = pathlib.Path(TEST_PACKAGE_DIR) / test_name + with verifiers.VerifyPackage(self, test_name, output_dir) as v: + package.build( + name=test_name, + format=self.PACKAGE_FORMAT | Package.Format.MLIR, + mode=self.PACKAGE_MODE, + output_dir=output_dir + ) + + checker = v.file_checker(f"*_ConvertValueToStd.mlir") + checker.check_not('memref.global') + checker.check_not('memref.get_global') + checker.check('memref.alloc(%arg0, %arg1)') + checker.check('memref.dealloc') + checker.run() + + def test_static_temp_array(self) -> None: + import accera as acc + test_name = "test_static_temp_array" + + package = Package() + + M = 16 + N = 32 + + A = acc.Array(role=acc.Role.INPUT, shape=(M, N), layout=acc.Array.Layout.FIRST_MAJOR) + B = acc.Array(role=acc.Role.TEMP, shape=(M, N), layout=acc.Array.Layout.FIRST_MAJOR) + C = acc.Array(role=acc.Role.INPUT_OUTPUT, shape=(N, M), layout=acc.Array.Layout.FIRST_MAJOR) + + nest = acc.Nest(shape=(M, N)) + i, j = nest.get_indices() + + @nest.iteration_logic + def _(): + B[i, j] = A[i, j] + C[j, i] = B[i, j] + + function = package.add(nest, args=(A, C), base_name=test_name) + + output_dir = pathlib.Path(TEST_PACKAGE_DIR) / test_name + with verifiers.VerifyPackage(self, test_name, output_dir) as v: + package.build( + name=test_name, + format=self.PACKAGE_FORMAT | Package.Format.MLIR, + mode=self.PACKAGE_MODE, + output_dir=output_dir + ) + + checker = v.file_checker(f"*_ConvertValueToStd.mlir") + checker.check('memref.global') + checker.check('memref.get_global') + checker.check_not('memref.alloc') + checker.check_not('memref.dealloc') + checker.run() + if __name__ == '__main__': unittest.main(verbosity=10) diff --git a/accera/transforms/include/value/ValueToLLVMLoweringPass.h b/accera/transforms/include/value/ValueToLLVMLoweringPass.h index ffc6cd4c..62a73dc4 100644 --- a/accera/transforms/include/value/ValueToLLVMLoweringPass.h +++ b/accera/transforms/include/value/ValueToLLVMLoweringPass.h @@ -38,6 +38,7 @@ void populateValueToLLVMNonMemPatterns(mlir::LLVMTypeConverter& typeConverter, m void populateGlobalValueToLLVMNonMemPatterns(mlir::LLVMTypeConverter& typeConverter, mlir::RewritePatternSet& patterns); void populateLocalValueToLLVMNonMemPatterns(mlir::LLVMTypeConverter& typeConverter, mlir::RewritePatternSet& patterns); void populateValueToLLVMMemPatterns(mlir::LLVMTypeConverter& typeConverter, mlir::RewritePatternSet& patterns); +void populateReshapeOpToLLVMMemPatterns(mlir::LLVMTypeConverter& typeConverter, mlir::RewritePatternSet& patterns); const mlir::LowerToLLVMOptions& GetDefaultAcceraLLVMOptions(mlir::MLIRContext* context); std::unique_ptr> createValueToLLVMPass(mlir::LowerToLLVMOptions options); diff --git a/accera/transforms/include/value/ValueToStandardLoweringPass.h 
b/accera/transforms/include/value/ValueToStandardLoweringPass.h index f7682ec3..65bd4cbc 100644 --- a/accera/transforms/include/value/ValueToStandardLoweringPass.h +++ b/accera/transforms/include/value/ValueToStandardLoweringPass.h @@ -27,7 +27,7 @@ struct ProfileRegions; namespace accera::transforms::value { void populateVectorizeValueOpPatterns(mlir::RewritePatternSet& patterns); -void populateValueToStandardPatterns(bool enableProfiling, ProfileRegions& profileRegions, mlir::RewritePatternSet& patterns); +[[maybe_unused]] void populateValueToStandardPatterns(bool enableProfiling, ProfileRegions& profileRegions, mlir::RewritePatternSet& patterns); void populateValueLaunchFuncPatterns(mlir::RewritePatternSet& patterns); void populateValueModuleRewritePatterns(mlir::RewritePatternSet& patterns); diff --git a/accera/transforms/src/exec/ExecutionPlanToAffineLoweringPass.cpp b/accera/transforms/src/exec/ExecutionPlanToAffineLoweringPass.cpp index 21989a90..562bcc1c 100644 --- a/accera/transforms/src/exec/ExecutionPlanToAffineLoweringPass.cpp +++ b/accera/transforms/src/exec/ExecutionPlanToAffineLoweringPass.cpp @@ -744,7 +744,7 @@ std::vector GetMajorToMinorDimensionTraversal(const mlir::MemRefType& so llvm::SmallVector strides; int64_t offset; - auto strideResult = mlir::getStridesAndOffset(sourceType, strides, offset); + [[maybe_unused]] auto strideResult = mlir::getStridesAndOffset(sourceType, strides, offset); assert(succeeded(strideResult)); std::vector> strideAndLogicalDims; size_t dim = 0; @@ -1081,7 +1081,7 @@ struct MultiCacheInfo std::pair GetAccessValueAndIndices(Operation* loadOrStoreOp) { - bool isLoadOrStore = isa(loadOrStoreOp); + [[maybe_unused]] bool isLoadOrStore = isa(loadOrStoreOp); assert(isLoadOrStore); if (auto stdStoreOp = dyn_cast_or_null(loadOrStoreOp)) { @@ -1123,7 +1123,7 @@ std::pair GetAccessValueAndIndices(Operation* loa v::MMALoadSyncOp::Adaptor adaptor{ valueMMALoadSyncOp }; return std::make_pair(adaptor.memref(), adaptor.indices()); } - assert(false && "Unhandled load/store case"); + throw LogicException(LogicExceptionErrors::notImplemented, "Unhandled load/store case"); } bool ComputeRegionAccessedByOp(PatternRewriter& rewriter, mlir::MemRefRegion& activeBlockRegion, mlir::Operation* op, unsigned loopDepth, const std::unordered_map& handlesToKeepParametric = {}) @@ -1133,7 +1133,7 @@ bool ComputeRegionAccessedByOp(PatternRewriter& rewriter, mlir::MemRefRegion& ac if (isa(op)) { - auto result = ComputeMemrefRegion(activeBlockRegion, op, loopDepth, nullptr, false, handlesToKeepParametric); + [[maybe_unused]] auto result = ComputeMemrefRegion(activeBlockRegion, op, loopDepth, nullptr, false, handlesToKeepParametric); assert(succeeded(result)); return true; } @@ -1163,11 +1163,11 @@ bool IsCacheParametricOnGPUProc(mlir::Attribute cacheMemSpace, v::Processor gpuP gpuProc == v::Processor::BlockY || gpuProc == v::Processor::BlockZ; - bool isThread = gpuProc == v::Processor::ThreadX || - gpuProc == v::Processor::ThreadY || - gpuProc == v::Processor::ThreadZ || - gpuProc == v::Processor::WarpX || - gpuProc == v::Processor::WarpY; + [[maybe_unused]] bool isThread = gpuProc == v::Processor::ThreadX || + gpuProc == v::Processor::ThreadY || + gpuProc == v::Processor::ThreadZ || + gpuProc == v::Processor::WarpX || + gpuProc == v::Processor::WarpY; assert((isBlock || isThread) && "Loops that are bound to GPU proc handles other than block and thread handles are not well defined"); @@ -1277,7 +1277,7 @@ std::optional ComputeAccessInfoForArrayAtLevel(PatternRewriter& { // 
Validate that the held loop has the same gpuProc and gpuProcMap as the current one, otherwise throw auto heldForOp = mlir::getForInductionVarOwner(findIt->second); - auto&& [heldGpuProc, heldGpuProcMap] = getProcAndMap(heldForOp); + [[maybe_unused]] auto&& [heldGpuProc, heldGpuProcMap] = getProcAndMap(heldForOp); assert((heldGpuProc == gpuProc) && (heldGpuProcMap == gpuProcMap) && "Found duplicate index amongst for ops but they had different GPU mappings"); } else @@ -1408,7 +1408,7 @@ std::optional ComputeAccessInfoForArrayAtLevel(PatternRewriter& } else { - auto unionResult = result.activeBlock.unionBoundingBox(activeBlockRegion); + [[maybe_unused]] auto unionResult = result.activeBlock.unionBoundingBox(activeBlockRegion); assert(succeeded(unionResult)); result.activeBlock.cst.removeRedundantConstraints(); @@ -1749,6 +1749,7 @@ mlir::Value GetOriginalIV(mlir::Value possiblyOffsetIV) assert(false && "Offset IVs must be offset with AffineApplyOps and constants"); } } + return nullptr; } mlir::AffineMap ComputeLoopIVToDefinitionOrderMap(const std::vector& ivs, mlir::MLIRContext* context) @@ -1802,7 +1803,7 @@ mlir::AffineMap ComputeLoopIVToDefinitionOrderMap(const std::vector otherDefiningOp = otherIVBlockArg.getOwner()->getParentOp(); } bool currentIsAncestor = currentDefiningOp->isAncestor(otherDefiningOp); - bool otherIsAncestor = otherDefiningOp->isAncestor(currentDefiningOp); + [[maybe_unused]] bool otherIsAncestor = otherDefiningOp->isAncestor(currentDefiningOp); assert((currentIsAncestor || otherIsAncestor) && "ComputeLoopIVDefinitionOrder only works on nested AffineForOp IVs"); return currentIsAncestor; }); @@ -2400,7 +2401,7 @@ LogicalResult ActiveElementCacheCopyOpRewrite::matchAndRewrite(ActiveElementCach assert(dst.getType().isa()); auto dstMemRefType = dst.getType().cast(); const v::MemorySpace dstMemRefSpace{ dstMemRefType.getMemorySpaceAsInt() }; - auto baseDstElementType = GetInnerElementType(dst); // e.g. f32 + [[maybe_unused]] auto baseDstElementType = GetInnerElementType(dst); // e.g. f32 assert(baseSrcElementType == baseDstElementType && "Copy source and dest data types don't match"); @@ -2794,7 +2795,7 @@ LogicalResult ActiveBlockCacheCopyOpRewrite::matchAndRewrite(ActiveBlockCacheCop llvm::SmallVector outerArrayStrides; int64_t activeBlockOffset; // TODO : do we need to leverage this in any way? 
we're currently just arranging the threads according to fast/slow dimensions of the logical memref - auto strideResult = mlir::getStridesAndOffset(memRefType, outerArrayStrides, activeBlockOffset); + [[maybe_unused]] auto strideResult = mlir::getStridesAndOffset(memRefType, outerArrayStrides, activeBlockOffset); assert(succeeded(strideResult)); auto numOuterArrayMultiCacheDims = outerArrayStrides.size() - activeBlockRank; std::vector outerArrayActiveBlockStrides(outerArrayStrides.begin() + numOuterArrayMultiCacheDims, outerArrayStrides.end()); @@ -3803,6 +3804,7 @@ mlir::Value FindParentAffineForOpIV(mlir::Operation* op, const Index& loopnestIn currentParentForOp = currentParentForOp->getParentOfType(); } assert(false && "Given loopnest index does not correspond to a parent AffineForOp"); + return nullptr; } std::vector ResolveParentRelevantScheduleIndices(mlir::Operation* op, const mlir::ValueRange& baseRelevantScheduleIndices) @@ -4660,7 +4662,7 @@ LogicalResult MergeCacheRegionOpsRewrite::matchAndRewrite(BeginCreateCacheOp beg std::vector beginOpsForRemoval; std::vector endOpsForRemoval; - auto baseArray = beginCreateCacheOp.baseInput(); + [[maybe_unused]] auto baseArray = beginCreateCacheOp.baseInput(); // If the outermost loop in this cache region is used to index into the base array then the cache regions cannot be merged // as they will have different access patterns @@ -5135,7 +5137,7 @@ LogicalResult BeginCreateCacheOpRewrite::matchAndRewrite(BeginCreateCacheOp begi if (!cachesHaveSameShape) { - auto unionResult = matchingExistingInfoIter->arrayAccessInfo.activeBlock.unionBoundingBox(currentMultiCacheInfo.arrayAccessInfo.activeBlock); + [[maybe_unused]] auto unionResult = matchingExistingInfoIter->arrayAccessInfo.activeBlock.unionBoundingBox(currentMultiCacheInfo.arrayAccessInfo.activeBlock); assert(succeeded(unionResult)); matchingExistingInfoIter->arrayAccessInfo.activeBlock.cst.removeRedundantConstraints(); } @@ -6743,12 +6745,23 @@ LogicalResult HoistScalingToCacheReduceRewrite::matchAndRewrite(mlir::AffineStor Operation* targetCacheReduceOpOperation = nullptr; for (auto& cacheReduceOp : activeBlockCacheReduceOps) { - auto cacheReduceBlock = cacheReduceOp->getBlock(); - auto ancestorOp = cacheReduceBlock->findAncestorOpInBlock(*affineStoreOp.getOperation()); - if (ancestorOp) + // If the cache reduce op is still inside of an un-expanded loopnest, then wait for that loopnest to expand before attempting to hoist scales + if (cacheReduceOp->getParentOfType()) { - assert(targetCacheReduceOpOperation == nullptr); // Only expect one cache reduce op to be a candidate - targetCacheReduceOpOperation = cacheReduceOp; + return failure(); + } + + // Cache reduce ops are inside of lambdas to control for GPU block/thread shifting of the cache + // So find the parent LambdaOp and check that LambdaOp's parent block + if (auto parentLambdaOp = cacheReduceOp->getParentOfType()) + { + auto parentBlock = parentLambdaOp->getBlock(); + auto ancestorOp = parentBlock->findAncestorOpInBlock(*affineStoreOp.getOperation()); + if (ancestorOp) + { + assert(targetCacheReduceOpOperation == nullptr); // Only expect one cache reduce op to be a candidate + targetCacheReduceOpOperation = cacheReduceOp; + } } } for (auto& cacheReduceOp : activeElementCacheReduceOps) diff --git a/accera/transforms/src/gpu/ConvertLaunchFuncToVulkanCalls.cpp b/accera/transforms/src/gpu/ConvertLaunchFuncToVulkanCalls.cpp index 159c5a9e..0fbc008b 100644 --- a/accera/transforms/src/gpu/ConvertLaunchFuncToVulkanCalls.cpp +++ 
b/accera/transforms/src/gpu/ConvertLaunchFuncToVulkanCalls.cpp @@ -56,8 +56,8 @@ static constexpr const char* kVulkanLaunch = "vulkanLaunch"; // Custom to acc-vulkan-runtime-wrappers static constexpr const char* kSetRepeatedRunCharacteristics = "setRepeatedRunCharacteristics"; static constexpr const char* kVulkanRuntimeInstanceHandle = "VulkanRuntime_Instance_Handle"; -static constexpr const char* kInitVulkanEmittedFunc = "initVulkanUtilities"; -static constexpr const char* kDestroyVulkanEmittedFunc = "destroyVulkanUtilities"; +[[maybe_unused]] static constexpr const char* kInitVulkanEmittedFunc = "initVulkanUtilities"; +[[maybe_unused]] static constexpr const char* kDestroyVulkanEmittedFunc = "destroyVulkanUtilities"; static constexpr const char* kInitializeFuncAttrName = "rc_gpu_init"; static constexpr const char* kDeInitializeFuncAttrName = "rc_gpu_deinit"; static constexpr const char* kVulkanRuntimeHandleAccessor = "getVulkanRuntimeInstance"; diff --git a/accera/transforms/src/nest/LoopNestToValueFunc.cpp b/accera/transforms/src/nest/LoopNestToValueFunc.cpp index bb258ad5..779f8e52 100644 --- a/accera/transforms/src/nest/LoopNestToValueFunc.cpp +++ b/accera/transforms/src/nest/LoopNestToValueFunc.cpp @@ -221,16 +221,16 @@ struct LoopNestToValueFuncPass : public accera::transforms::LoopNestToValueFuncB { RewritePatternSet patterns(context); - xptr::populateExecutionPlanScaleHoistingPatterns(patterns); + utilir::FillCanonicalPatternsRecursively(vFuncOp, patterns); (void)applyPatternsAndFoldGreedily(vFuncOp, std::move(patterns)); - snapshotter.Snapshot("ExecutionPlanScaleHoisting", vFuncOp); + snapshotter.Snapshot("Canonicalize", vFuncOp); } { RewritePatternSet patterns(context); - utilir::FillCanonicalPatternsRecursively(vFuncOp, patterns); + xptr::populateExecutionPlanScaleHoistingPatterns(patterns); (void)applyPatternsAndFoldGreedily(vFuncOp, std::move(patterns)); - snapshotter.Snapshot("Canonicalize", vFuncOp); + snapshotter.Snapshot("ExecutionPlanScaleHoisting", vFuncOp); } { diff --git a/accera/transforms/src/util/RangeValueUtilities.cpp b/accera/transforms/src/util/RangeValueUtilities.cpp index b328e1f8..d8b25b89 100644 --- a/accera/transforms/src/util/RangeValueUtilities.cpp +++ b/accera/transforms/src/util/RangeValueUtilities.cpp @@ -390,11 +390,10 @@ RangeValue RangeValueAnalysis::resolveRangeValue(AffineApplyOp op) llvmBinOp = Instruction::BinaryOps::SDiv; break; case mlir::AffineExprKind::CeilDiv: - assert(false); // Unsupported currently - no matching llvm bin op - break; + // Unsupported currently - no matching llvm bin op + throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, "CeilDiv is not implemented"); default: - assert(false); - break; + throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, "Unsupported binary op expression"); } llvm::SmallVector operandRanges{ lhsRv, rhsRv }; auto rv = resolveRangeValue(llvmBinOp, operandRanges); diff --git a/accera/transforms/src/value/ValueFuncToTargetPass.cpp b/accera/transforms/src/value/ValueFuncToTargetPass.cpp index 1933dd3a..5c079a1a 100644 --- a/accera/transforms/src/value/ValueFuncToTargetPass.cpp +++ b/accera/transforms/src/value/ValueFuncToTargetPass.cpp @@ -93,9 +93,6 @@ void mapValueTypeAttr(OpT& op, mlir::BlockAndValueMapping& mapping) }); } -constexpr auto kDefaultExecutionTarget = vir::ExecutionTarget::CPU; -constexpr size_t kLaunchConfigNumDims = 6; - struct ValueFuncToTargetPass : public tr::ValueFuncToTargetBase { ValueFuncToTargetPass(const 
tr::IntraPassSnapshotOptions& options = {}) : diff --git a/accera/transforms/src/value/ValueToLLVMLoweringPass.cpp b/accera/transforms/src/value/ValueToLLVMLoweringPass.cpp index 4f5e4025..f163bb4b 100644 --- a/accera/transforms/src/value/ValueToLLVMLoweringPass.cpp +++ b/accera/transforms/src/value/ValueToLLVMLoweringPass.cpp @@ -677,7 +677,7 @@ struct VpmaddwdOpLowering : public ValueLLVMOpConversionPattern LLVMTypeConverter llvmTypeConverter(rewriter.getContext()); auto outputVecType = op.getType().cast(); auto outputVecLLVMType = llvmTypeConverter.convertType(outputVecType); - auto outputRank = outputVecType.getRank(); + [[maybe_unused]] auto outputRank = outputVecType.getRank(); assert(outputRank == 1 && "Vpmaddwd op should have a 1-D result"); auto elementCount = outputVecType.getShape()[0]; auto avx512Support = util::ModuleSupportsTargetDeviceFeature(op, "avx512"); @@ -1129,6 +1129,221 @@ struct RawPointerAPIUnusedUndefRemoval : public OpRewritePattern } }; +static OpFoldResult getExpandedDimSize( + OpBuilder &builder, + Location loc, + Type &llvmIndexType, + int64_t outDimIndex, ArrayRef outStaticShape, + MemRefDescriptor &inDesc, + ArrayRef inStaticShape, + ArrayRef reassocation, + DenseMap &outDimToInDimMap) +{ + int64_t outDimSize = outStaticShape[outDimIndex]; + if (!ShapedType::isDynamic(outDimSize)) + { + return builder.getIndexAttr(outDimSize); + } + + // Calculate the multiplication of all the out dim sizes except the current dim. + int64_t inDimIndex = outDimToInDimMap[outDimIndex]; + int64_t otherDimSizesMul = 1; + + for (auto otherDimIndex : reassocation[inDimIndex]) + { + if (otherDimIndex == static_cast(outDimIndex)) + { + continue; + } + otherDimSizesMul *= outStaticShape[otherDimIndex]; + } + + // outDimSize = inDimSize / otherOutDimSizesMul + int64_t inDimSize = inStaticShape[inDimIndex]; + Value inDimSizeDynamic = + ShapedType::isDynamic(inDimSize) + ? inDesc.size(builder, loc, inDimIndex) + : builder.create(loc, llvmIndexType, builder.getIndexAttr(inDimSize)); + + Value outDimSizeDynamic = builder.create( + loc, + inDimSizeDynamic, + builder.create(loc, llvmIndexType, builder.getIndexAttr(otherDimSizesMul))); + + return outDimSizeDynamic; +} + +// Compute a map that for a given dimension of the expanded type gives the +// dimension in the collapsed type it maps to. Essentially its the inverse of the `reassocation` maps. +static DenseMap getExpandedDimToOriginalDimMap(ArrayRef reassociation) +{ + llvm::DenseMap dimMap; + for (auto &dimArray : enumerate(reassociation)) + { + for (auto dim : dimArray.value()) + { + dimMap[dim] = dimArray.index(); + } + } + return dimMap; +} + +static SmallVector getExpandedShape( + OpBuilder &builder, + Location loc, + Type &llvmIndexType, + ArrayRef reassociation, + ArrayRef inStaticShape, + MemRefDescriptor &inDesc, + ArrayRef outStaticShape) +{ + DenseMap outDimToInDimMap = getExpandedDimToOriginalDimMap(reassociation); + return llvm::to_vector<4>(llvm::map_range( + llvm::seq(0, outStaticShape.size()), [&](int64_t outDimIndex) { + return getExpandedDimSize( + builder, + loc, + llvmIndexType, + outDimIndex, + outStaticShape, + inDesc, + inStaticShape, + reassociation, + outDimToInDimMap); + })); +} + +/// Helper function to convert a vector of `OpFoldResult`s into a vector of `Value`s. 
+static SmallVector getResultAsValues( + OpBuilder &builder, + Location loc, + Type &llvmIndexType, + ArrayRef valueOrAttrVec) +{ + return llvm::to_vector<4>( + llvm::map_range(valueOrAttrVec, [&](OpFoldResult value) -> Value { + if (auto attr = value.dyn_cast()) + { + return builder.create(loc, llvmIndexType, attr); + } + return value.get(); + })); +} + +static SmallVector getDynamicExpandedShape( + OpBuilder &builder, + Location loc, + Type &llvmIndexType, + ArrayRef reassociation, + ArrayRef inStaticShape, + MemRefDescriptor &inDesc, + ArrayRef outStaticShape) +{ + return getResultAsValues( + builder, + loc, + llvmIndexType, + getExpandedShape( + builder, + loc, + llvmIndexType, + reassociation, + inStaticShape, + inDesc, + outStaticShape)); +} + +bool isStrideOrOffsetStatic(int64_t strideOrOffset) +{ + return !ShapedType::isDynamicStrideOrOffset(strideOrOffset); +} + +struct ExpandShapeOpLowering : public ConvertOpToLLVMPattern +{ +public: + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + using ReshapeOpAdaptor = typename memref::ExpandShapeOp::Adaptor; + + LogicalResult matchAndRewrite( + memref::ExpandShapeOp reshapeOp, + ReshapeOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override + { + MemRefType dstType = reshapeOp.getResultType(); + MemRefType srcType = reshapeOp.getSrcType(); + + int64_t offset; + SmallVector strides; + if (failed(getStridesAndOffset(dstType, strides, offset))) + { + return rewriter.notifyMatchFailure(reshapeOp, "failed to get stride and offset exprs"); + } + + MemRefDescriptor srcDesc(adaptor.src()); + Location loc = reshapeOp->getLoc(); + auto dstDesc = MemRefDescriptor::undef(rewriter, loc, this->typeConverter->convertType(dstType)); + + dstDesc.setAllocatedPtr(rewriter, loc, srcDesc.allocatedPtr(rewriter, loc)); + dstDesc.setAlignedPtr(rewriter, loc, srcDesc.alignedPtr(rewriter, loc)); + dstDesc.setOffset(rewriter, loc, srcDesc.offset(rewriter, loc)); + + ArrayRef srcStaticShape = srcType.getShape(); + ArrayRef dstStaticShape = dstType.getShape(); + Type llvmIndexType = this->typeConverter->convertType(rewriter.getIndexType()); + + SmallVector dstShape = getDynamicExpandedShape( + rewriter, + loc, + llvmIndexType, + reshapeOp.getReassociationIndices(), + srcStaticShape, + srcDesc, + dstStaticShape); + + for (auto &shape : llvm::enumerate(dstShape)) + { + dstDesc.setSize(rewriter, loc, shape.index(), shape.value()); + } + + if (llvm::all_of(strides, isStrideOrOffsetStatic)) + { + for (auto &stride : llvm::enumerate(strides)) + { + dstDesc.setConstantStride(rewriter, loc, stride.index(), stride.value()); + } + } + else if (srcType.getLayout().isIdentity() && dstType.getLayout().isIdentity()) + { + Value stride = rewriter.create(loc, llvmIndexType, rewriter.getIndexAttr(1)); + for (auto dimIndex : llvm::reverse(llvm::seq(0, dstShape.size()))) + { + dstDesc.setStride(rewriter, loc, dimIndex, stride); + stride = rewriter.create(loc, dstShape[dimIndex], stride); + } + } + else + { + // There could be mixed static/dynamic strides. For simplicity, we + // recompute all strides if there is at least one dynamic stride. + // See comments for computeExpandedLayoutMap in llvm source code + // for details on how the strides are calculated. 
+ for (auto &dimArray : llvm::enumerate(reshapeOp.getReassociationIndices())) + { + auto currentStrideToExpand = srcDesc.stride(rewriter, loc, dimArray.index()); + for (auto dstIndex : llvm::reverse(dimArray.value())) + { + dstDesc.setStride(rewriter, loc, dstIndex, currentStrideToExpand); + Value size = dstDesc.size(rewriter, loc, dstIndex); + currentStrideToExpand = rewriter.create(loc, size, currentStrideToExpand); + } + } + + } + rewriter.replaceOp(reshapeOp, {dstDesc}); + return success(); + } +}; + } // namespace using namespace accera::transforms::value; @@ -1569,6 +1784,7 @@ void ValueToLLVMLoweringPass::runOnModule() RewritePatternSet patterns(&getContext()); populateValueToLLVMMemPatterns(llvmTypeConverter, patterns); + populateReshapeOpToLLVMMemPatterns(llvmTypeConverter, patterns); populateMathToLLVMConversionPatterns(llvmTypeConverter, patterns); populateMemRefToLLVMConversionPatterns(llvmTypeConverter, patterns); populateStdToLLVMConversionPatterns(llvmTypeConverter, patterns); @@ -1662,6 +1878,11 @@ void populateValueToLLVMMemPatterns(mlir::LLVMTypeConverter& typeConverter, mlir patterns.insert(typeConverter, context); } +void populateReshapeOpToLLVMMemPatterns(mlir::LLVMTypeConverter& typeConverter, mlir::RewritePatternSet& patterns) +{ + patterns.insert(typeConverter); +} + const mlir::LowerToLLVMOptions& GetDefaultAcceraLLVMOptions(mlir::MLIRContext* context) { static LowerToLLVMOptions options(context); // statically allocated default we hand out copies to diff --git a/accera/transforms/src/value/ValueToStandardLoweringPass.cpp b/accera/transforms/src/value/ValueToStandardLoweringPass.cpp index 40312d54..c8d8f0a8 100644 --- a/accera/transforms/src/value/ValueToStandardLoweringPass.cpp +++ b/accera/transforms/src/value/ValueToStandardLoweringPass.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -469,6 +470,46 @@ struct PrintProfileResultsOpLowering : public OpRewritePattern { + mlir::OpBuilder::InsertPoint GetPreferredAllocInsertionPoint(ValueAllocOp op) const + { + auto operands = op.getOperation()->getOperands(); + // Find the operand which is dominated by all the other operands and the parentFuncOp + // This is therefore the last / deepest operand and the alloc should occur after it + auto parentFuncOp = op->getParentOfType(); + DominanceInfo domInfo(parentFuncOp); + mlir::Operation* currentLeastDominantOperation = parentFuncOp; + bool insertInsideOperandBlock = true; + for (auto operand : operands) + { + mlir::Operation* currentOp; + bool currentOpIsBlockArg = operand.isa(); + if (currentOpIsBlockArg) + { + auto blockArg = operand.cast(); + currentOp = blockArg.getOwner()->getParentOp(); + } + else + { + currentOp = operand.getDefiningOp(); + } + if (domInfo.dominates(currentLeastDominantOperation, currentOp)) + { + currentLeastDominantOperation = currentOp; + insertInsideOperandBlock = currentOpIsBlockArg; + } + } + if (insertInsideOperandBlock) + { + auto& block = currentLeastDominantOperation->getRegion(0).front(); + return mlir::OpBuilder::InsertPoint(&block, block.begin()); + } + else + { + auto block = currentLeastDominantOperation->getBlock(); + return mlir::OpBuilder::InsertPoint(block, ++mlir::Block::iterator(currentLeastDominantOperation)); + } + } + using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(ValueAllocOp op, PatternRewriter& rewriter) const final @@ -483,34 +524,31 @@ struct AllocOpLowering : public OpRewritePattern auto memrefType = op.getType(); auto allocType = 
op.allocType().getValueOr(vir::MemoryAllocType::Global); + if (memrefType.getNumDynamicDims() != 0) + { + // dynamic allocations must all be heap allocations + allocType = vir::MemoryAllocType::Heap; + } OpBuilder::InsertionGuard guard(rewriter); - auto parentFuncOp = op->getParentOfType(); + auto insertPoint = GetPreferredAllocInsertionPoint(op); + mlir::memref::AllocOp allocOp; mlir::Block* parentBlock; mlir::Value allocatedMemref; switch (allocType) { case vir::MemoryAllocType::Global: { - if (memrefType.getNumDynamicDims() == 0) - { - auto globalOp = irutil::CreateGlobalBufferOp(rewriter, op, MemRefType::Builder{ memrefType }.setLayout({}), kGlobalOpSymNameFormat); - rewriter.replaceOpWithNewOp(op, memrefType, globalOp.sym_name()); - } - else - { - rewriter.replaceOpWithNewOp(op, memrefType, op.getOperation()->getOperands(), op.alignmentAttr()); - } + auto globalOp = irutil::CreateGlobalBufferOp(rewriter, op, MemRefType::Builder{ memrefType }.setLayout({}), kGlobalOpSymNameFormat); + rewriter.replaceOpWithNewOp(op, memrefType, globalOp.sym_name()); } break; case vir::MemoryAllocType::Stack: - // Create the stack allocation at the beginning of the function - rewriter.setInsertionPointToStart(&parentFuncOp.front()); + rewriter.restoreInsertionPoint(insertPoint); rewriter.replaceOpWithNewOp(op, MemRefType::Builder{ memrefType }.setLayout({}), mlir::ValueRange{}, op.alignmentAttr()); break; case vir::MemoryAllocType::Heap: - // Create the heap allocation at the beginning of the function - rewriter.setInsertionPointToStart(&parentFuncOp.front()); + rewriter.restoreInsertionPoint(insertPoint); allocOp = rewriter.replaceOpWithNewOp(op, memrefType, op.getOperation()->getOperands(), op.alignmentAttr()); // Create a dealloc op at the end of the block containing this alloc op @@ -563,8 +601,8 @@ struct CastOpLowering : public OpRewritePattern auto fromType = op.source().getType(); auto toType = op.result().getType(); - auto isFromTypeVector = fromType.isa(); - auto isToTypeVector = toType.isa(); + [[maybe_unused]] auto isFromTypeVector = fromType.isa(); + [[maybe_unused]] auto isToTypeVector = toType.isa(); assert(isFromTypeVector == isToTypeVector && "Can only cast vectors to vectors or scalars to scalars"); auto fromElementType = util::GetElementType(fromType); @@ -2520,7 +2558,7 @@ LogicalResult EnterProfileRegionOpLowering::matchAndRewrite(EnterProfileRegionOp auto millisecondsInSecond = rewriter.create(loc, util::GetValAttr(rewriter, currentTime.getType(), 1000)); mlir::Value newCurrentTime = rewriter.create(loc, vir::BinaryOpPredicate::MUL, currentTime, millisecondsInSecond); - + rewriter.create(loc, newCurrentTime, startTimeRef); rewriter.eraseOp(op); return success(); @@ -2728,7 +2766,7 @@ LogicalResult VhaddLowering::matchAndRewrite(ValueVHADDOp op, PatternRewriter& r auto rhs = op.rhs(); auto vecType = lhs.getType().cast(); - auto rank = vecType.getRank(); + [[maybe_unused]] auto rank = vecType.getRank(); assert(rank == 1 && "vhadd only supports rank-1 vectors"); auto elementType = vecType.getElementType(); auto elementCount = vecType.getNumElements(); @@ -2911,8 +2949,8 @@ void populateValueToStandardPatterns(bool enableProfiling, ProfileRegions& profi VhaddLowering>(context); patterns.insert(context, enableProfiling, profileRegions); + PrintProfileResultsOpLowering, + ExitProfileRegionOpLowering>(context, enableProfiling, profileRegions); } std::unique_ptr> createValueToStdPass(bool enableProfiling) diff --git a/accera/transforms/src/vectorization/VectorizationUtil.cpp 
b/accera/transforms/src/vectorization/VectorizationUtil.cpp index 51adc344..9976702d 100644 --- a/accera/transforms/src/vectorization/VectorizationUtil.cpp +++ b/accera/transforms/src/vectorization/VectorizationUtil.cpp @@ -2885,7 +2885,7 @@ mlir::LogicalResult vectorizeInt16MatMul(mlir::AffineForOp affineForOp, // 6. add C + (A * B) if (innerLoopBodyIter == innerLoopBodyEnd || !isa(*innerLoopBodyIter)) { - return reportMatchFailure(affineForOp, "Failed to match the binary add op"); + return reportMatchFailure(affineForOp, "Failed to match the binary op"); } auto accOp = cast(*innerLoopBodyIter++); if (accOp.predicate() != v::BinaryOpPredicate::ADD) @@ -3194,7 +3194,7 @@ mlir::LogicalResult vectorizeMaskedLoadStore(mlir::AffineForOp loopOp, loadVal = thenCastOp.result(); } - // match second load op for accumulation case + // match second load op for binop case mlir::AffineLoadOp loadOp2; mlir::Value loadVal2; @@ -3216,32 +3216,42 @@ mlir::LogicalResult vectorizeMaskedLoadStore(mlir::AffineForOp loopOp, loadVal2 = thenCastOp2.result(); } - // binary add op for accumulation case - v::BinOp accOp; + // optional binary op + v::BinOp binOp; if (thenOpsIter != thenOpsEnd && isa(thenOpsIter)) { - accOp = cast(thenOpsIter++); + binOp = cast(thenOpsIter++); } - // Check that the operands for the accumulation op are in fact the values from load ops - mlir::Value accVal; - if (accOp) + // Check that the operands for the bin op are in fact the values from load ops + mlir::Value binOpVal; + if (binOp) { - if (!((accOp.lhs() == loadVal && accOp.rhs() == loadVal2) || (accOp.rhs() == loadVal && accOp.lhs() == loadVal2))) + if (!loadVal2) { - return reportMatchFailure(accOp, "Failed to match the accumulation operands"); + // Check if one of the operands to the bin op is a constant + auto otherVal = binOp.lhs() == loadVal ? 
binOp.rhs() : binOp.lhs(); + if (otherVal.getDefiningOp()) + { + loadVal2 = otherVal; + } } - matchedOps.push(accOp); - accVal = accOp.getResult(); + + if (!((binOp.lhs() == loadVal && binOp.rhs() == loadVal2) || (binOp.rhs() == loadVal && binOp.lhs() == loadVal2))) + { + return reportMatchFailure(binOp, "Failed to match the binOp operands"); + } + matchedOps.push(binOp); + binOpVal = binOp.getResult(); } - // optionally check if there is a cast op after accumulation op + // optionally check if there is a cast op after bin op v::CastOp thenCastOp3; if (thenOpsIter != thenOpsEnd && isa(thenOpsIter)) { thenCastOp3 = cast(thenOpsIter++); matchedOps.push(thenCastOp3); - accVal = thenCastOp3.result(); + binOpVal = thenCastOp3.result(); } // store op @@ -3390,21 +3400,37 @@ mlir::LogicalResult vectorizeMaskedLoadStore(mlir::AffineForOp loopOp, std::vector indices2(adaptor2.indices().begin(), adaptor2.indices().end()); auto [flatCastMemref2, flattenedPosition2] = FlattenAccess(rewriter, loadOp2, indices2); - mlir::Value accumulateOperand2 = rewriter.create(loadLoc2, loadVectorType2, flatCastMemref2, mlir::ValueRange{ flattenedPosition2 }, permutationMap, finalPaddingOpValue2, mask, inbounds); + mlir::Value binOperand2 = rewriter.create(loadLoc2, loadVectorType2, flatCastMemref2, mlir::ValueRange{ flattenedPosition2 }, permutationMap, finalPaddingOpValue2, mask, inbounds); // optional cast op after second vector transfer read op if (thenCastOp2) // then cast op { // Create a cast to hold vector of values auto castVecType2 = mlir::VectorType::get({ unrollMax }, thenCastOp2.getType()); - accumulateOperand2 = rewriter.create(loopOp.getLoc(), accumulateOperand2, castVecType2); + binOperand2 = rewriter.create(loopOp.getLoc(), binOperand2, castVecType2); } - // if there is a second masked load, accumulation operator must follow before final store - // create binary add op to accumulate results from first and second masked load ops - valueToStore = rewriter.create(accOp.getLoc(), v::BinaryOpPredicate::ADD, valueToStore, accumulateOperand2); + // if there is a second masked load, bin operator must follow before final store + // create binary op to combine results from first and second masked load ops + valueToStore = rewriter.create(binOp.getLoc(), binOp.getPredicate(), valueToStore, binOperand2); + + // optional cast op after bin op + if (thenCastOp3) // then cast op + { + // Create a cast to hold vector of values + auto castVecType3 = mlir::VectorType::get({ unrollMax }, thenCastOp3.getType()); + valueToStore = rewriter.create(loopOp.getLoc(), valueToStore, castVecType3); + } + } + else if (loadVal2 && loadVal2.getDefiningOp()) + { + auto constantOp = loadVal2.getDefiningOp(); + // If the second operand to the binop is a constant, broadcast to a vector and replicate the bin op + auto vectorType = mlir::VectorType::get({ unrollMax }, constantOp.getType()); + auto broadcastConstantOp = rewriter.create(constantOp.getLoc(), vectorType, constantOp); + valueToStore = rewriter.create(binOp.getLoc(), binOp.getPredicate(), valueToStore, broadcastConstantOp); - // optional cast op after accumulation op + // optional cast op after bin op if (thenCastOp3) // then cast op { // Create a cast to hold vector of values diff --git a/accera/utilities/test/src/MemoryLayout_test.cpp b/accera/utilities/test/src/MemoryLayout_test.cpp index c197edae..b125f9f0 100644 --- a/accera/utilities/test/src/MemoryLayout_test.cpp +++ b/accera/utilities/test/src/MemoryLayout_test.cpp @@ -92,7 +92,9 @@ TEST_CASE("TestMemoryLayoutCtors") 
TEST_CASE("TestMemoryLayoutSlice") { constexpr int64_t rows = 3, columns = 5, channels = 7, outerExtent = 4; - auto physicalSize = GENERATE_COPY(chunk(GENERATE(range(1, 4)), values({ rows, columns, channels, outerExtent }))); + // BUGBUG: causes infinite loop in Catch2 3.3.1 + // auto physicalSize = GENERATE_COPY(chunk(GENERATE(range(1, 4)), values({ rows, columns, channels, outerExtent }))); + auto physicalSize = GENERATE_COPY(chunk(2, values({ rows, columns, channels, outerExtent }))); std::vector order(physicalSize.size()); std::iota(order.begin(), order.end(), 0); @@ -107,17 +109,10 @@ TEST_CASE("TestMemoryLayoutSlice") auto sliced = layout.GetSliceLayout(sliceDimension); CHECK(sliced.NumDimensions() == (layout.NumDimensions() - 1)); - CHECKED_IF(sliceDimension == 0) - { - CHECK(sliced.NumElements() == (layout.NumElements() / layout.GetExtent(0))); - } - CHECKED_ELSE(sliceDimension == 0) - { - CHECK(sliced.NumElements() == (layout.NumElements() / layout.GetExtent(0))); - } + CHECK(sliced.NumElements() == (layout.NumElements() / layout.GetExtent(sliceDimension))); auto slicedNumDimensions = sliced.NumDimensions(); - CHECKED_ELSE(slicedNumDimensions == 0) + CHECKED_IF(slicedNumDimensions == 0) { auto dimension = GENERATE_COPY(range(zero, slicedNumDimensions)); CHECKED_IF(dimension < sliceDimension) diff --git a/accera/value/include/Scalar.h b/accera/value/include/Scalar.h index ad57277b..5498b602 100644 --- a/accera/value/include/Scalar.h +++ b/accera/value/include/Scalar.h @@ -46,7 +46,7 @@ namespace value Scalar(Scalar&&) noexcept; Scalar& operator=(const Scalar&); Scalar& operator=(Scalar&&) noexcept; - ~Scalar(); + virtual ~Scalar(); /// Gets the underlying wrapped Value instance Value GetValue() const; diff --git a/accera/value/include/ScalarDimension.h b/accera/value/include/ScalarDimension.h index de8a225e..14db7bfa 100644 --- a/accera/value/include/ScalarDimension.h +++ b/accera/value/include/ScalarDimension.h @@ -17,6 +17,7 @@ namespace value ScalarDimension(Role role = Role::Input); ScalarDimension(const std::string& name, Role role = Role::Input); ScalarDimension(Value value, const std::string& name = "", Role role = Role::Input); + ~ScalarDimension(); virtual void SetValue(Value value) final; }; diff --git a/accera/value/src/MLIREmitterContext.cpp b/accera/value/src/MLIREmitterContext.cpp index 8e6a15bc..ad3a338a 100644 --- a/accera/value/src/MLIREmitterContext.cpp +++ b/accera/value/src/MLIREmitterContext.cpp @@ -1014,7 +1014,7 @@ static accera::ir::value::MemoryAllocType AllocateFlagToAllocateType(accera::val MAP_FLAGS(Heap, Heap); // MAP_FLAGS(ThreadLocal, ThreadLocal); // Not implemented default: - assert(false); + llvm_unreachable("Unknown allocation flag"); } #undef MAP_PREDICATE @@ -1172,7 +1172,7 @@ EmitterContext::DefinedFunction MLIRContext::CreateFunctionImpl(FunctionDeclarat auto isPublic = decl.IsPublic(); auto funcTarget = decl.Target(); auto funcRuntime = decl.Runtime(); - auto isGpu = std::holds_alternative(funcTarget); + [[maybe_unused]] auto isGpu = std::holds_alternative(funcTarget); const auto& argTypes = decl.GetParameterTypes(); const auto& returnType = decl.GetReturnType(); @@ -1977,7 +1977,7 @@ Value MLIRContext::ViewImpl(Value sourceValue, const std::vector& offset std::vector sizes; std::vector strides; auto convertValueToMLIRIndexValue = [&](int64_t sentinelValue) { - return [&](Scalar scalarValue) -> mlir::Value { + return [&, sentinelValue](Scalar scalarValue) -> mlir::Value { auto mlirVal = ToMLIRValue(builder, scalarValue); if (auto 
constantIndex = mlirVal.getDefiningOp()) { @@ -2102,6 +2102,18 @@ Value MLIRContext::ReinterpretCastImpl(Value input, ValueType valueType) auto outputElementBitwidth = outputMlirElemType.getIntOrFloatBitWidth(); auto outputElementBytewidth = outputElementBitwidth / 8; + // Special case where the input element type and target element type have the same bitwidth + // Then the layout doesn't change + if (inputElementBytewidth == outputElementBytewidth) + { + mlir::MemRefType::Builder outputTypeBuilder(inputMemrefType); + outputTypeBuilder.setElementType(outputMlirElemType); + mlir::MemRefType outputMemRefType = outputTypeBuilder; + auto returnVal = builder.create(loc, inputMlir, outputMemRefType); + + return Wrap(returnVal, input.GetLayout()); + } + auto d0 = mlir::getAffineDimExpr(0, mlirCtx); auto d1 = mlir::getAffineDimExpr(1, mlirCtx); diff --git a/accera/value/src/ScalarDimension.cpp b/accera/value/src/ScalarDimension.cpp index 6a6bbdb5..5236f3ec 100644 --- a/accera/value/src/ScalarDimension.cpp +++ b/accera/value/src/ScalarDimension.cpp @@ -23,5 +23,8 @@ namespace value { Scalar::SetValue(value); } + + ScalarDimension::~ScalarDimension() = default; + } // namespace value } // namespace accera \ No newline at end of file
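
---

The `getExpandedDimToOriginalDimMap` helper introduced above inverts the reassociation grouping so that each expanded dimension can look up the collapsed dimension it came from. Below is a minimal standalone sketch of that inversion using plain standard containers rather than the MLIR types; the names are illustrative only.

```
// Standalone sketch (not the MLIR implementation): invert a reassociation
// grouping so each expanded dimension maps back to its collapsed dimension.
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

// reassociation[i] lists the expanded dims that fold into collapsed dim i,
// e.g. expanding a 2-D shape into 3-D with the grouping {{0, 1}, {2}}.
std::unordered_map<int64_t, int64_t>
ExpandedDimToCollapsedDim(const std::vector<std::vector<int64_t>>& reassociation)
{
    std::unordered_map<int64_t, int64_t> dimMap;
    for (size_t collapsedDim = 0; collapsedDim < reassociation.size(); ++collapsedDim)
    {
        for (int64_t expandedDim : reassociation[collapsedDim])
        {
            dimMap[expandedDim] = static_cast<int64_t>(collapsedDim);
        }
    }
    return dimMap;
}

int main()
{
    auto dimMap = ExpandedDimToCollapsedDim({ { 0, 1 }, { 2 } });
    for (const auto& [expanded, collapsed] : dimMap)
    {
        std::cout << "expanded dim " << expanded << " -> collapsed dim " << collapsed << "\n";
    }
}
```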
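The dynamic branch of `getExpandedDimSize` computes a runtime extent as the collapsed dimension's size divided by the product of the statically-known sibling extents in the same reassociation group. A simplified sketch of that arithmetic follows, assuming a `-1` sentinel for dynamic extents; the helper name is not from the codebase.

```
// Standalone sketch of the dynamic-dimension arithmetic used when expanding a
// shape: the dynamic output dim equals the input dim divided by the product of
// the static sibling output dims in its group.
#include <cstdint>
#include <iostream>
#include <vector>

constexpr int64_t kDynamic = -1; // illustrative sentinel for a dynamic extent

int64_t ExpandedDimSize(int64_t inputDimSize,
                        const std::vector<int64_t>& groupOutputSizes,
                        size_t dynamicIndexInGroup)
{
    int64_t otherSizesProduct = 1;
    for (size_t i = 0; i < groupOutputSizes.size(); ++i)
    {
        if (i != dynamicIndexInGroup)
        {
            otherSizesProduct *= groupOutputSizes[i];
        }
    }
    // outDimSize = inDimSize / product(other static out dims in the group)
    return inputDimSize / otherSizesProduct;
}

int main()
{
    // Expanding a runtime extent of 48 into (?, 16): the dynamic dim is 48 / 16 = 3.
    std::vector<int64_t> group = { kDynamic, 16 };
    std::cout << ExpandedDimSize(48, group, 0) << "\n"; // prints 3
}
```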
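For identity layouts, the `ExpandShapeOpLowering` pattern rebuilds strides by walking the expanded sizes from the innermost dimension outward and accumulating a running product. A plain C++ sketch of that row-major stride recomputation:

```
// Standalone sketch of the identity-layout stride recomputation: walk the
// expanded sizes from the innermost dimension outward, multiplying as you go.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> RowMajorStrides(const std::vector<int64_t>& sizes)
{
    std::vector<int64_t> strides(sizes.size());
    int64_t stride = 1;
    for (size_t i = sizes.size(); i-- > 0;)
    {
        strides[i] = stride;
        stride *= sizes[i];
    }
    return strides;
}

int main()
{
    for (int64_t s : RowMajorStrides({ 3, 16, 4 })) // expect 64 4 1
        std::cout << s << " ";
    std::cout << "\n";
}
```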
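`AllocOpLowering` now forces any buffer with dynamic dimensions onto the heap, regardless of the requested allocation type, so that the matching dealloc can release it at the end of the enclosing block. The sketch below captures just the decision rule; the enum and the `-1` sentinel are loosely modeled on the real types for illustration.

```
// Standalone sketch of the "dynamic sizes force heap allocation" rule: buffers
// whose extents are only known at runtime cannot be emitted as fixed-size
// globals or stack slots, so they fall back to heap allocation.
#include <cstdint>
#include <iostream>
#include <vector>

enum class AllocKind { Global, Stack, Heap };

constexpr int64_t kDynamic = -1; // illustrative sentinel for a dynamic extent

AllocKind ChooseAllocKind(const std::vector<int64_t>& shape, AllocKind requested)
{
    for (int64_t extent : shape)
    {
        if (extent == kDynamic)
        {
            return AllocKind::Heap; // dynamic allocations must all be heap allocations
        }
    }
    return requested;
}

int main()
{
    std::cout << static_cast<int>(ChooseAllocKind({ 4, 16 }, AllocKind::Stack)) << "\n";         // stays Stack (1)
    std::cout << static_cast<int>(ChooseAllocKind({ kDynamic, 16 }, AllocKind::Global)) << "\n"; // forced to Heap (2)
}
```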
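The masked load/store vectorizer now accepts a constant as the second bin-op operand by splatting it across the vector width and applying the op lane by lane before the final masked store. A scalar-loop sketch of that broadcast-then-apply behavior; the function name is made up for illustration and stands in for the `vector.broadcast` plus `BinOp` sequence.

```
// Standalone sketch of the constant-operand case in the masked vector binop:
// the scalar constant is broadcast across the vector width and the binary op
// is applied lane by lane against the loaded vector.
#include <functional>
#include <iostream>
#include <vector>

std::vector<float> ApplyWithBroadcastConstant(const std::vector<float>& loaded,
                                              float constant,
                                              const std::function<float(float, float)>& binOp)
{
    std::vector<float> broadcast(loaded.size(), constant); // splat the constant
    std::vector<float> result(loaded.size());
    for (size_t lane = 0; lane < loaded.size(); ++lane)
    {
        result[lane] = binOp(loaded[lane], broadcast[lane]);
    }
    return result;
}

int main()
{
    auto out = ApplyWithBroadcastConstant({ 1.f, 2.f, 3.f, 4.f }, 0.5f, std::multiplies<float>());
    for (float v : out)
        std::cout << v << " "; // prints 0.5 1 1.5 2
    std::cout << "\n";
}
```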
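Making `Scalar`'s destructor virtual matters because `ScalarDimension` derives from it and may be destroyed through a base pointer. A minimal sketch of the general C++ rule, using hypothetical `Base`/`Derived` types rather than the Accera classes:

```
// Standalone sketch of why a polymorphic base class needs a virtual destructor:
// deleting a derived object through a base pointer is only well-defined when
// the base destructor is virtual, and only then does derived cleanup run.
#include <iostream>
#include <memory>

struct Base
{
    virtual ~Base() { std::cout << "~Base\n"; } // virtual so derived cleanup runs
};

struct Derived : Base
{
    ~Derived() override { std::cout << "~Derived\n"; }
};

int main()
{
    std::unique_ptr<Base> p = std::make_unique<Derived>();
    p.reset(); // prints "~Derived" then "~Base"; without `virtual`, behavior is undefined
}
```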
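`ReinterpretCastImpl` short-circuits when the source and destination element types share a bitwidth: the memref layout is reused and only the element type is swapped. The sketch below illustrates why same-width reinterpretation leaves extents and strides untouched, using `memcpy` as the portable way to express the bit-level reinterpretation; it is an analogy, not the memref lowering itself.

```
// Standalone sketch of the same-bitwidth reinterpret-cast special case: when
// source and destination element types have the same width, element count,
// extents, and strides are unchanged and only the element type is reinterpreted.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

int main()
{
    static_assert(sizeof(float) == sizeof(int32_t), "same bitwidth required");

    std::vector<int32_t> raw = { 0x3F800000, 0x40000000 }; // bit patterns of 1.0f and 2.0f
    std::vector<float> reinterpreted(raw.size());
    // Same byte count, same layout; only the element type changes.
    std::memcpy(reinterpreted.data(), raw.data(), raw.size() * sizeof(int32_t));

    for (float f : reinterpreted)
        std::cout << f << " "; // prints 1 2
    std::cout << "\n";
}
```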