From 1127a6040662ed4548aac4b479308a6e793cd071 Mon Sep 17 00:00:00 2001 From: Lisa Ong Date: Fri, 24 Mar 2023 12:03:35 +0800 Subject: [PATCH] Squashed commit of the following: commit 40dffe83929973c8e205c395be5db23c360c2397 Author: Denny Sun Date: Thu Mar 23 05:06:51 2023 +0000 Merged PR 3176: [Accera] split_dim op supports dynamic dims with static split size With this fix the following test case which has dynamic dims with static split size can succeed. ``` M, MN = create_dimensions() N = 16 Input = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(MN,)) Output = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(M, N)) nest = Nest(shape=(M, N)) i, j = nest.get_indices() @nest.iteration_logic def _(): split_input = Input._split_dimension(0, cast(16, ScalarType.index)) Output[i, j] = split_input[i, j] ``` commit 451b67405d77ebbe1cf0722f9c7aeb191c3b4beb Author: Mason Remy Date: Thu Mar 23 01:19:37 2023 +0000 Merged PR 3174: Ensure any dynamic allocations are heap allocs that get dealloced Ensure any dynamic allocations are heap allocs commit 602b068f19cdf1ff9111aacbeb5d704974521ebd Author: Kern Handa Date: Wed Mar 22 20:59:43 2023 +0000 Merged PR 3171: [test] Add some tests for Dimensions commit ccd1f5c39964fbe96815672f137b5185ee2e9885 Author: Mason Remy Date: Wed Mar 22 19:41:02 2023 +0000 Merged PR 3175: Support reinterpret cast of same bitwidth without changing layout Support reinterpret cast of same bitwidth without changing layout commit 270a3c8a9c1e1c06b0da3c9d61fa2f04438e3076 Author: Kern Handa Date: Fri Mar 17 22:16:08 2023 +0000 Merged PR 3167: Remove hack to treat INPUT_OUTPUT Arrays with shape (1,) as Elements I don't have complete context on this, so this might break something. If it does, that should be fixed separately rather than keep this hack around, which breaks semantics in non-obvious ways. commit efcff61727c64e7f0a37f4f92c701bc47ea1c470 Author: Lisa Ong Date: Fri Mar 17 08:09:07 2023 +0000 Merged PR 3165: [build] Fix clang 14 release build warnings treated as errors on macOS/Apple Errors are showing up on release builds: ``` cmake .. -DCMAKE_BUILD_TYPE=Release -G Ninja cmake --build . --config Release ``` Clang version: ``` Apple clang version 14.0.0 (clang-1400.0.29.202) Target: arm64-apple-darwin22.3.0 Thread model: posix ``` commit 43f311aa706214243ce8d7acca7d29993bb7003b Author: Lisa Ong Date: Fri Mar 17 07:02:09 2023 +0000 Merged PR 3162: Bump vcpkg to latest release Last release was Sept 2022. 
Update to the latest tag (2023.02.24) Preparation for LLVM 15 upgrade commit 07098f502596d997bbe241e95f1130c11e318220 Author: Mason Remy Date: Thu Mar 16 23:27:04 2023 +0000 Merged PR 3161: Fix cache reduce scale constant hoisting Fix cache reduce scale constant hoisting commit 696ef0df5947067f94b64255e00b7fffc4c04f9d Author: Mason Remy Date: Thu Mar 16 20:54:22 2023 +0000 Merged PR 3163: Extend vector masked loads/stores to handle arbitrary bin ops and constant operands Extend vector masked loads/stores to handle arbitrary bin ops and constant operands --- CMakeLists.txt | 13 +- accera/hat/include/HATEmitter.h | 6 + accera/ir/include/value/ValueOps.td | 10 +- accera/ir/src/AffineConstraintsHelper.cpp | 2 +- accera/ir/src/TranslateToHeader.cpp | 8 - accera/ir/src/value/ValueDialect.cpp | 6 +- accera/python/accera/lang/Array.py | 22 +- accera/python/accera/test/dsl_tests.py | 49 +- accera/python/accera/test/smoke_tests.py | 495 +++++++++++++----- .../include/value/ValueToLLVMLoweringPass.h | 1 + .../value/ValueToStandardLoweringPass.h | 2 +- .../ExecutionPlanToAffineLoweringPass.cpp | 55 +- .../gpu/ConvertLaunchFuncToVulkanCalls.cpp | 4 +- .../src/nest/LoopNestToValueFunc.cpp | 8 +- .../src/util/RangeValueUtilities.cpp | 7 +- .../src/value/ValueFuncToTargetPass.cpp | 3 - .../src/value/ValueToLLVMLoweringPass.cpp | 223 +++++++- .../src/value/ValueToStandardLoweringPass.cpp | 78 ++- .../src/vectorization/VectorizationUtil.cpp | 66 ++- .../utilities/test/src/MemoryLayout_test.cpp | 15 +- accera/value/include/Scalar.h | 2 +- accera/value/include/ScalarDimension.h | 1 + accera/value/src/MLIREmitterContext.cpp | 18 +- accera/value/src/ScalarDimension.cpp | 3 + 24 files changed, 852 insertions(+), 245 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b20f53a..5695293a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,12 +35,12 @@ option(STRICT_MODE "Build with 'warnings as errors'" OFF) option(USE_MKL "Build with Intel MKL" OFF) option(USE_LIBCXX "Build with libc++ if using the Clang compiler" OFF) -if(CMAKE_CXX_COMPILER_ID STREQUAL Clang) +if(CMAKE_CXX_COMPILER_ID STREQUAL Clang OR CMAKE_CXX_COMPILER_ID STREQUAL AppleClang) if(USE_LIBCXX OR (CMAKE_HOST_SYSTEM_NAME STREQUAL Darwin)) add_compile_options(-stdlib=libc++) link_libraries(-lc++ -lc++abi) endif(USE_LIBCXX OR (CMAKE_HOST_SYSTEM_NAME STREQUAL Darwin)) -endif(CMAKE_CXX_COMPILER_ID STREQUAL Clang) +endif(CMAKE_CXX_COMPILER_ID STREQUAL Clang OR CMAKE_CXX_COMPILER_ID STREQUAL AppleClang) # Try to create a compilation database, which is useful to have when working # with clang tooling @@ -161,10 +161,13 @@ else() set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -ggdb3 -O0") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -ggdb3") set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -ggdb3") - if(${CMAKE_CXX_COMPILER_ID} STREQUAL Clang) + + if(CMAKE_CXX_COMPILER_ID STREQUAL Clang OR CMAKE_CXX_COMPILER_ID STREQUAL AppleClang) if(CMAKE_BUILD_TYPE STREQUAL Debug) - # Set options for Control Flow Integrity - add_compile_options(-fsanitize=cfi) + # Set options for Control Flow Integrity + if(NOT ${OSX_NATIVE_ARCH} STREQUAL "arm64") + add_compile_options(-fsanitize=cfi) + endif() endif(CMAKE_BUILD_TYPE STREQUAL Debug) add_compile_options(-Wno-backslash-newline-escape) diff --git a/accera/hat/include/HATEmitter.h b/accera/hat/include/HATEmitter.h index 916ab167..91779905 100644 --- a/accera/hat/include/HATEmitter.h +++ b/accera/hat/include/HATEmitter.h @@ -294,6 +294,8 @@ class Parameter : public 
TOMLSerializable return SerializeCommonParameters(); } + virtual ~Parameter() = default; + protected: Parameter(const LogicalParamType& logicalType, const std::string& name, const std::string& description, const UsageType usage, const std::string& declaredType, const std::string& elementType) : _logicalType{ logicalType }, @@ -415,6 +417,8 @@ class Function : public TOMLSerializable _description{ description }, _callingConvention{ callingConvention } {} + virtual ~Function() = default; + std::string Name() const { return _name; } std::string Description() const { return _description; } @@ -1021,6 +1025,8 @@ class ExternalLibraryReference : public TOMLSerializable return table; } + virtual ~ExternalLibraryReference() = default; + private: std::string _name; std::string _version; diff --git a/accera/ir/include/value/ValueOps.td b/accera/ir/include/value/ValueOps.td index b57a37c2..5cb1b057 100644 --- a/accera/ir/include/value/ValueOps.td +++ b/accera/ir/include/value/ValueOps.td @@ -645,9 +645,13 @@ def accv_CallOp : accv_Op<"call", [CallOpInterface]> { $_state.addOperands(operands); $_state.addAttribute("callee", callee); $_state.addTypes(results); - }]>, OpBuilder<(ins "StringRef":$callee, "ArrayRef":$results, CArg<"ValueRange", "{}">:$operands), [{ - build($_builder, $_state, StringAttr::get($_builder.getContext(), callee), results, - operands); + + // BUGBUG: -Werror,-Winfinite-recursion, needed? + // }]>, OpBuilder<(ins "StringRef":$callee, "ArrayRef":$results, CArg<"ValueRange", "{}">:$operands), [{ + // build($_builder, $_state, StringAttr::get($_builder.getContext(), callee), results, + // operands); + // + }]>]; let extraClassDeclaration = [{ diff --git a/accera/ir/src/AffineConstraintsHelper.cpp b/accera/ir/src/AffineConstraintsHelper.cpp index 3f5c1eca..a0153ef6 100644 --- a/accera/ir/src/AffineConstraintsHelper.cpp +++ b/accera/ir/src/AffineConstraintsHelper.cpp @@ -200,7 +200,7 @@ namespace util if (_cst.containsId(val)) { unsigned id = 0; - bool found = _cst.findId(val, &id); + [[maybe_unused]] bool found = _cst.findId(val, &id); assert(found); return IdWrapper::FromFullId(id, _cst); } diff --git a/accera/ir/src/TranslateToHeader.cpp b/accera/ir/src/TranslateToHeader.cpp index 63df849c..9b598f90 100644 --- a/accera/ir/src/TranslateToHeader.cpp +++ b/accera/ir/src/TranslateToHeader.cpp @@ -750,14 +750,6 @@ namespace ir } std::vector shapeVec; std::transform(shape.begin(), shape.end(), std::back_inserter(shapeVec), [](int64_t val) { return static_cast(val); }); - if (usage != hat::UsageType::Input && shapeVec.size() == 1 && shapeVec[0] == 1) - { - // TODO: This is currently a hack since output Dimension does not work. So in the DSL we use Array - // instead and here we emulate an ElementParameter instead. Remove this when output Dimension are working. 
- assert(declaredType.back() == '*'); - return std::make_unique(name, description, usage, declaredType.substr(0, declaredType.length() - 1), elementType); - } - return std::make_unique(name, description, usage, declaredType, elementType, shapeVec, affineMap, affineOffset); } diff --git a/accera/ir/src/value/ValueDialect.cpp b/accera/ir/src/value/ValueDialect.cpp index 2abd8a22..fc0f8215 100644 --- a/accera/ir/src/value/ValueDialect.cpp +++ b/accera/ir/src/value/ValueDialect.cpp @@ -511,9 +511,9 @@ MemRefType ViewOp::computeMemRefType(Value source, ValueRange sizes, ValueRange auto context = source.getContext(); auto sourceMemRefType = source.getType().cast(); int64_t sourceRank = sourceMemRefType.getRank(); - int64_t numOffsets = static_cast(offsets.size()); - int64_t numSizes = static_cast(sizes.size()); - int64_t numStrides = static_cast(strides.size()); + [[maybe_unused]] int64_t numOffsets = static_cast(offsets.size()); + [[maybe_unused]] int64_t numSizes = static_cast(sizes.size()); + [[maybe_unused]] int64_t numStrides = static_cast(strides.size()); assert(sourceRank == numOffsets); assert(sourceRank == numSizes); assert(sourceRank == numStrides); diff --git a/accera/python/accera/lang/Array.py b/accera/python/accera/lang/Array.py index cd076637..80bdb94b 100644 --- a/accera/python/accera/lang/Array.py +++ b/accera/python/accera/lang/Array.py @@ -11,7 +11,7 @@ from operator import mul from typing import * -from .._lang_python import ScalarType, _MemoryLayout, AllocateFlags, Role +from .._lang_python import ScalarType, _MemoryLayout, AllocateFlags, Role, type_size_bytes from .._lang_python._lang import Array as NativeArray, Dimension from .Layout import Layout, MemoryMapLayout from ..Parameter import DelayedParameter @@ -177,12 +177,15 @@ def _value(self): return None def _reinterpret_cast_internal(self, element_type): - if any(map(lambda d: isinstance(d, Dimension), self.shape)): - expected_layout = [-1] + src_element_size = type_size_bytes(self.element_type) + dst_element_size = type_size_bytes(element_type) + if src_element_size == dst_element_size: + expected_layout = self.shape else: - src_element_size = np.dtype(SCALAR_TYPE_TO_DTYPE_STR[self.element_type]).itemsize - dst_element_size = np.dtype(SCALAR_TYPE_TO_DTYPE_STR[element_type]).itemsize - expected_layout = [int(self._num_elements * (src_element_size / dst_element_size))] + if any(map(lambda d: isinstance(d, Dimension), self.shape)): + expected_layout = [-1] + else: + expected_layout = [int(self._num_elements * (src_element_size / dst_element_size))] reinterpreted = Array(role=self.role, element_type=element_type, shape=expected_layout) return reinterpreted @@ -190,9 +193,12 @@ def _get_memory_buffer(self): return self._reinterpret_cast_internal(ScalarType.uint8) def _reinterpret_cast(self, element_type): - if self.element_type != ScalarType.uint8 or len(self.shape) != 1: + src_bytewidth = type_size_bytes(self.element_type) + dst_bytewidth = type_size_bytes(element_type) + if src_bytewidth != dst_bytewidth and \ + (self.element_type != ScalarType.uint8 or len(self.shape) != 1): raise RuntimeError( - "Can only call reinterpret cast on flat uint8 memory buffers. Call _get_memory_buffer first?" + "Can only call reinterpret cast such that the bitwidth doesn't change, or on flat uint8 memory buffers. Call _get_memory_buffer first?" 
) return self._reinterpret_cast_internal(element_type) diff --git a/accera/python/accera/test/dsl_tests.py b/accera/python/accera/test/dsl_tests.py index c632fb1c..11f4462b 100644 --- a/accera/python/accera/test/dsl_tests.py +++ b/accera/python/accera/test/dsl_tests.py @@ -640,6 +640,22 @@ def test_reinterpret_cast(self) -> None: shape=(256, 256), ) + def reinterpret_arr_as_int16(array: Array): + # Assumes array is f32 + + num_elements = reduce(lambda x, y: x*y, array.shape, 1) + arr_mb = array._get_memory_buffer() + self.assertEqual(arr_mb.shape, [num_elements * 4]) + self.assertEqual(arr_mb.element_type, ScalarType.uint8) + print(arr_mb.layout) + + arr_as_int16 = arr_mb._reinterpret_cast(ScalarType.int16) + self.assertEqual(arr_as_int16.shape, [num_elements * 2]) + self.assertEqual(arr_as_int16.element_type, ScalarType.int16) + print(arr_as_int16.layout) + + return arr_as_int16 + def reinterpret_arr_as_int32(array: Array): # Assumes array is f32 @@ -656,9 +672,19 @@ def reinterpret_arr_as_int32(array: Array): return arr_as_int32 + def simple_reinterpret_arr_as_int32(array: Array): + # Assumes array is f32 + + num_elements = reduce(lambda x, y: x*y, array.shape, 1) + arr_as_int32 = array._reinterpret_cast(ScalarType.int32) + self.assertEqual(arr_as_int32.shape, array.shape) + self.assertEqual(arr_as_int32.element_type, ScalarType.int32) + print(arr_as_int32.layout) + + return arr_as_int32 + # add a function that utilizes a subarray layout - def make_reinterpreted_fn(array): - reinterpreted = reinterpret_arr_as_int32(array) + def make_reinterpreted_fn(array, reinterpreted): nest = Nest(shape=reinterpreted.shape) i = nest.get_indices() @@ -668,12 +694,23 @@ def _(): return package.add(nest, args=(reinterpreted, )) - reinterpreted_fn = make_reinterpreted_fn(arr) + reinterpreted_i32 = reinterpret_arr_as_int32(arr) + reinterpreted_i32_fn = make_reinterpreted_fn(arr, reinterpreted_i32) + + reinterpreted_i16 = reinterpret_arr_as_int16(arr) + reinterpreted_i16_fn = make_reinterpreted_fn(arr, reinterpreted_i16) + + simple_reinterpreted_i32 = simple_reinterpret_arr_as_int32(arr) + simple_reinterpreted_i32_fn = make_reinterpreted_fn(arr, simple_reinterpreted_i32) # add a function that instantiates a subarray of the input array and calls the function above def main(array): - reinterpreted_array = reinterpret_arr_as_int32(array) - reinterpreted_fn(reinterpreted_array) + reinterpreted_array_i32 = reinterpret_arr_as_int32(array) + reinterpreted_array_i16 = reinterpret_arr_as_int16(array) + simple_reinterpreted_array_i32 = simple_reinterpret_arr_as_int32(array) + reinterpreted_i32_fn(reinterpreted_array_i32) + reinterpreted_i16_fn(reinterpreted_array_i16) + simple_reinterpreted_i32_fn(simple_reinterpreted_array_i32) package.add(main, args=(arr, )) @@ -1432,7 +1469,7 @@ def _(): } # TODO: Disabling this verification for now, re-enable it when undoing this change. 
- # self._verify_helper(package, get_size_fn_name, get_size_fn.name, correctness_check_values) + self._verify_helper(package, get_size_fn_name, get_size_fn.name, correctness_check_values) correctness_check_values = { "pre": [size_test, x_ref, start_array_pre_test, delta_test], diff --git a/accera/python/accera/test/smoke_tests.py b/accera/python/accera/test/smoke_tests.py index 5b312e05..d0499b71 100644 --- a/accera/python/accera/test/smoke_tests.py +++ b/accera/python/accera/test/smoke_tests.py @@ -5,17 +5,18 @@ #################################################################################################### import inspect -from itertools import product -import os -import sys -import unittest import logging +import os import pathlib import platform import shutil -import numpy as np +import sys +import unittest +from itertools import product from typing import Callable, List +import numpy as np + try: import cuda except: @@ -48,13 +49,21 @@ "public": False } -from accera import Package, ScalarType, Nest, Array, Constants, Scalar, fuse, create_parameters, cast, Target, Role -from accera._lang_python._lang import _MemorySpace, _MMAShape, Dimension -from accera import min as accmin +from accera._lang_python import _MemoryLayout +from accera._lang_python._lang import Array as NativeArray +from accera._lang_python._lang import Dimension, _MemorySpace, _MMAShape, _If +from accera._lang_python._lang._gpu import Barrier from accera.samples import MatrixMultiplication -from accera.test import verifiers -from accera.test.test_utils import expectedFailure, FailedReason from accera.Targets import KNOWN_DEVICES +from accera.test import verifiers +from accera.test.test_utils import FailedReason, expectedFailure + +from accera import ( + AUTO, AllocateFlags, Array, Constants, Nest, Package, Role, Scalar, ScalarType, Target, cast, create_dimensions, + create_parameters, fuse +) +from accera import min as accmin +from accera import abs as accabs TEST_PACKAGE_DIR = "test_acccgen" @@ -500,6 +509,7 @@ def _(): def test_mlas_matmul(self) -> None: from itertools import combinations_with_replacement + from accera.samples.MatrixMultiplication import MLAS domains = combinations_with_replacement([1, 31, 63, 127], 3) @@ -616,7 +626,7 @@ def _(): v.check_correctness(function.name, before=(In_test, Out_test), after=(In_test, Out_ref)) def _test_fast_exp_mlas(self, func_level_precision: bool): - from accera import fast_exp_mlas, fast_exp + from accera import fast_exp, fast_exp_mlas M = 64 N = 64 @@ -647,9 +657,9 @@ def _(): pkg_opt = Package._Options.NONE if func_level_precision else Package._Options.HIGH_PRECISION_FLOATING_POINT_OPS # Create a package and add our function definition to it - package_name = "test_fast_exp_mlas" + package_name = f"test_fast_exp_mlas_{'func' if func_level_precision else 'pkg'}" package = Package() - function = package.add(plan, args=(In, Out), base_name="test_fast_exp_mlas", function_opts=func_opt) + function = package.add(plan, args=(In, Out), base_name=package_name, function_opts=func_opt) # Build the HAT package with verifiers.VerifyPackage(self, package_name, TEST_PACKAGE_DIR) as v: @@ -679,7 +689,8 @@ def test_fast_exp_mlas_w_pkg_level_precision(self): self._test_fast_exp_mlas(False) def test_emittime_cache_mlas_matmul(self) -> None: - from accera.samples.OfflineCacheMatrixMultiplication import EmitTimeCacheMLAS + from accera.samples.OfflineCacheMatrixMultiplication import \ + EmitTimeCacheMLAS package = Package() M, N, K = [31, 63, 127] @@ -705,7 +716,8 @@ def 
test_emittime_cache_mlas_matmul(self) -> None: v.check_correctness(function.name, before=(A_test, B_test, C_test), after=(A_test, B_test, C_ref)) def test_runtime_init_cache_mlas_matmul(self) -> None: - from accera.samples.OfflineCacheMatrixMultiplication import RuntimeInitCacheMLAS + from accera.samples.OfflineCacheMatrixMultiplication import \ + RuntimeInitCacheMLAS package = Package() @@ -778,8 +790,6 @@ def _(): def _make_vulkan_gpu_matmul_plan(self, M, N, K): import math - from accera import Target - from accera._lang_python._lang import _If, as_index def get_clamped_block_dimensions(M, N, base_block_dim_M=16, base_block_dim_N=16): return min(M, base_block_dim_M), min(N, base_block_dim_N) @@ -883,8 +893,6 @@ def test_two_vulkan_gpu_matmul(self) -> None: @expectedFailure(FailedReason.NOT_IN_CORE, "function that contains multiple nests") def test_int8_matmul(self) -> None: - from accera import cast - # Define our matrix sizes M = 128 N = 256 @@ -1588,7 +1596,6 @@ def _fn(): def test_dynamic_sub_array_split_dim_subfunction(self) -> None: # This is a contrived way to simply copy an array, but the utilities used are for packing partial higher dimensional arrays - from accera import create_dimensions test_name = "test_dynamic_sub_array_split_dim_subfunction" N = 64 @@ -1650,10 +1657,173 @@ def _outer_fn(): test_output_ref = test_input.copy() v.check_correctness(function.name, before=(test_input, test_output), after=(test_input, test_output_ref)) + def test_dim_bool_operation(self): + N = create_dimensions() + In = Array(Role.INPUT, element_type=ScalarType.float32, shape=(N, )) + Out = Array(Role.INPUT_OUTPUT, element_type=ScalarType.int64, shape=(1, )) + + nest = Nest((1, )) + + @nest.iteration_logic + def _(): + + def T(): + Out[0] = 1 + + def F(): + Out[0] = 0 + + _If(N > 10, T).Else(F) + + name = "test_dim_bool_operation" + package = Package() + function = package.add(nest, args=(N, In, Out), base_name=name) + + In_test = np.random.rand(12).astype(np.float32) + Out_test = np.array([-1], dtype=np.int64) + Out_ref = np.array([1], dtype=np.int64) + + with verifiers.VerifyPackage(self, name, TEST_PACKAGE_DIR) as v: + package.build( + name, + format=self.PACKAGE_FORMAT, + mode=self.PACKAGE_MODE, + output_dir=TEST_PACKAGE_DIR, + ) + + v.check_correctness(function.name, before=(In_test, Out_test), after=(In_test, Out_ref)) + + @expectedFailure(FailedReason.BUG, "unknown reason") + def test_dim_cast_bool_operation(self): + N = create_dimensions() + In = Array(Role.INPUT, element_type=ScalarType.float32, shape=(N, )) + Out = Array(Role.INPUT_OUTPUT, element_type=ScalarType.int64, shape=(1, )) + + nest = Nest((1, )) + + @nest.iteration_logic + def _(): + Out[0] = cast(N > 10, ScalarType.int64) + + name = "test_dim_cast_bool_operation" + package = Package() + function = package.add(nest, args=(N, In, Out), base_name=name) + + In_test = np.random.rand(12).astype(np.float32) + Out_test = np.array([-1], dtype=np.int64) + Out_ref = np.array([1], dtype=np.int64) + + with verifiers.VerifyPackage(self, name, TEST_PACKAGE_DIR) as v: + package.build( + name, + format=self.PACKAGE_FORMAT, + mode=self.PACKAGE_MODE, + output_dir=TEST_PACKAGE_DIR, + ) + + v.check_correctness(function.name, before=(In_test, Out_test), after=(In_test, Out_ref)) + + def test_dim_arithmetic_operation_1(self): + N = create_dimensions() + In = Array(Role.INPUT, element_type=ScalarType.float32, shape=(N, )) + Out = Array(Role.INPUT_OUTPUT, element_type=ScalarType.int64, shape=(1, )) + + nest = Nest((1, )) + + @nest.iteration_logic + 
def _(): + Out[0] = cast(N, ScalarType.int64) / 3 + + name = "test_dim_arithmetic_operation_1" + package = Package() + function = package.add(nest, args=(N, In, Out), base_name=name) + + In_test = np.random.rand(12).astype(np.float32) + Out_test = np.array([-1], dtype=np.int64) + Out_ref = np.array([12 // 3], dtype=np.int64) + + with verifiers.VerifyPackage(self, name, TEST_PACKAGE_DIR) as v: + package.build( + name, + format=self.PACKAGE_FORMAT, + mode=self.PACKAGE_MODE, + output_dir=TEST_PACKAGE_DIR, + ) + + v.check_correctness(function.name, before=(In_test, Out_test), after=(In_test, Out_ref)) + + def test_dim_arithmetic_operation_2(self): + N1, N2 = create_dimensions() + In = Array(Role.INPUT, element_type=ScalarType.float32, shape=(N1, N2)) + Out = Array(Role.INPUT_OUTPUT, element_type=ScalarType.int64, shape=(1, )) + + nest = Nest((1, )) + + @nest.iteration_logic + def _(): + # is the difference between the dimensions odd + diff = cast(N1, ScalarType.int64) - cast(N2, ScalarType.int64) + is_odd = diff % 2 + Out[0] = is_odd + + name = "test_dim_arithmetic_operation_2" + package = Package() + function = package.add(nest, args=(N1, N2, In, Out), base_name=name) + + In_test = np.random.rand(12, 8).astype(np.float32) + Out_test = np.array([-1], dtype=np.int64) + Out_ref = np.array([(12 - 8) % 2], dtype=np.int64) + + with verifiers.VerifyPackage(self, name, TEST_PACKAGE_DIR) as v: + package.build( + name, + format=self.PACKAGE_FORMAT, + mode=self.PACKAGE_MODE, + output_dir=TEST_PACKAGE_DIR, + ) + + v.check_correctness(function.name, before=(In_test, Out_test), after=(In_test, Out_ref)) + + def test_dim_arithmetic_operation_then_bool_op(self): + N = create_dimensions() + In = Array(Role.INPUT, element_type=ScalarType.float32, shape=(N, )) + Out = Array(Role.INPUT_OUTPUT, element_type=ScalarType.int64, shape=(1, )) + + nest = Nest((1, )) + + @nest.iteration_logic + def _(): + mod2 = cast(N, ScalarType.int64) % 2 + + def T(): + Out[0] = 1 + + def F(): + Out[0] = 0 + + _If(mod2 == 0, T).Else(F) + + name = "test_dim_arithmetic_operation_then_bool_op" + package = Package() + function = package.add(nest, args=(N, In, Out), base_name=name) + + In_test = np.random.rand(12).astype(np.float32) + Out_test = np.array([-1], dtype=np.int64) + Out_ref = np.array([int(12 % 2 == 0)], dtype=np.int64) + + with verifiers.VerifyPackage(self, name, TEST_PACKAGE_DIR) as v: + package.build( + name, + format=self.PACKAGE_FORMAT, + mode=self.PACKAGE_MODE, + output_dir=TEST_PACKAGE_DIR, + ) + + v.check_correctness(function.name, before=(In_test, Out_test), after=(In_test, Out_ref)) + def test_dynamic_sub_array_multi_split_dim_subfunction(self) -> None: # Copy and pack a buffer into 2x4 tiles # Split the flat buffer into a 4-D buffer, where it has a truncated shape in the outer loop's cleanup loop - from accera import create_dimensions test_name = "test_dynamic_sub_array_multi_split_dim_subfunction" @@ -1816,16 +1986,17 @@ def packed_index(i_outer, i_middle, i_inner, j_outer, j_middle, j_inner, tile_of )] = test_input[i_outer + i_middle + i_inner, j_outer + j_middle + j_inner] v.check_correctness(function.name, before=(test_input, test_output), after=(test_input, test_output_ref)) - @expectedFailure(FailedReason.BUG, "_split_dimension of a dynamically sized dimension with a dynamic size is not working") + @expectedFailure( + FailedReason.BUG, "_split_dimension of a dynamically sized dimension with a dynamic size is not working" + ) def test_dynamic_split_dim_dynamic_size(self) -> None: - from accera import 
create_dimensions test_name = "test_dynamic_split_dim_dynamic_size" M, N, MN = create_dimensions() package = Package() - Input = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(MN,)) + Input = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(MN, )) Output = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(M, N)) nest = Nest(shape=(M, N)) @@ -1836,11 +2007,7 @@ def _(): split_input = Input._split_dimension(0, N) Output[i, j] = split_input[i, j] - fn = package.add( - nest, - args=(MN, M, N, Input, Output), - base_name=f"{test_name}_fn" - ) + fn = package.add(nest, args=(MN, M, N, Input, Output), base_name=f"{test_name}_fn") output_dir = pathlib.Path(TEST_PACKAGE_DIR) / test_name shutil.rmtree(output_dir, ignore_errors=True) @@ -1851,17 +2018,19 @@ def _(): ) # correctness check - test_M = 64 - test_N = 16 - test_MN = test_M*test_N - test_input = np.random.random([test_M*test_N]).astype(np.float32) + test_M = np.int64(64) + test_N = np.int64(16) + test_MN = np.int64(test_M * test_N) + test_input = np.random.random([test_M * test_N]).astype(np.float32) test_output = np.random.random([test_M, test_N]).astype(np.float32) test_output_ref = test_input.copy().reshape((test_M, test_N)) - v.check_correctness(function.name, before=(test_MN, test_M, test_N, test_input, test_output), after=(test_MN, test_M, test_N, test_input, test_output_ref)) + v.check_correctness( + fn.name, + before=(test_MN, test_M, test_N, test_input, test_output), + after=(test_MN, test_M, test_N, test_input, test_output_ref) + ) - @expectedFailure(FailedReason.BUG, "_split_dimension of a dynamically sized dimension with a static size is not working") def test_dynamic_split_dim_static_size(self) -> None: - from accera import create_dimensions test_name = "test_dynamic_split_dim_static_size" M, MN = create_dimensions() @@ -1869,6 +2038,51 @@ def test_dynamic_split_dim_static_size(self) -> None: package = Package() + Input = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(MN, )) + Output = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(M, N)) + + nest = Nest(shape=(M, N)) + i, j = nest.get_indices() + + @nest.iteration_logic + def _(): + split_input = Input._split_dimension(0, cast(16, ScalarType.index)) + Output[i, j] = split_input[i, j] + + fn = package.add(nest, args=(MN, M, Input, Output), base_name=f"{test_name}_fn") + + output_dir = pathlib.Path(TEST_PACKAGE_DIR) / test_name + shutil.rmtree(output_dir, ignore_errors=True) + + with verifiers.VerifyPackage(self, test_name, output_dir) as v: + package.build( + name=test_name, format=self.PACKAGE_FORMAT, mode=self.PACKAGE_MODE, output_dir=output_dir, _quiet=False + ) + + # correctness check + test_M = np.int64(64) + test_N = N + test_MN = np.int64(test_M * test_N) + test_input = np.random.random([test_M * test_N]).astype(np.float32) + test_output = np.random.random([test_M, test_N]).astype(np.float32) + test_output_ref = test_input.copy().reshape((test_M, test_N)) + v.check_correctness( + fn.name, + before=(test_MN, test_M, test_input, test_output), + after=(test_MN, test_M, test_input, test_output_ref) + ) + + # This test uses all static sizes to make sure the fix for dynamic size (test_dynamic_split_dim_static_size) + # won't regress the static size case. 
+ def test_dynamic_split_dim_all_static(self) -> None: + test_name = "test_dynamic_split_dim_all_static" + + M = 8 + MN = 128 + N = 16 + + package = Package() + Input = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(MN,)) Output = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(M, N)) @@ -1882,7 +2096,7 @@ def _(): fn = package.add( nest, - args=(MN, M, Input, Output), + args=(Input, Output), base_name=f"{test_name}_fn" ) @@ -1895,13 +2109,14 @@ def _(): ) # correctness check - test_M = 64 + test_M = 8 test_N = N test_MN = test_M*test_N test_input = np.random.random([test_M*test_N]).astype(np.float32) test_output = np.random.random([test_M, test_N]).astype(np.float32) test_output_ref = test_input.copy().reshape((test_M, test_N)) - v.check_correctness(function.name, before=(test_MN, test_M, test_input, test_output), after=(test_MN, test_M, test_input, test_output_ref)) + v.check_correctness(fn.name, before=(test_input, test_output), after=(test_input, test_output_ref)) + def test_padded_nchwc_conv2d_manual_cache(self) -> None: input_channels = 64 @@ -2066,7 +2281,6 @@ def _(): package.build(name=package_name, format=self.PACKAGE_FORMAT, mode=self.PACKAGE_MODE, output_dir=output_dir) def test_cross_compile(self) -> None: - from accera import Target M = 128 N = 256 K = 256 @@ -2891,8 +3105,6 @@ def test_boundary_differently_shaped_budget_cache(self) -> None: self._verify_matrix_multiplication_function(function, package, f"test_boundary_differently_shaped_budget_cache") def test_gpu_vec_add(self): - from accera import Array, Nest, Package, ScalarType, Target - # Define our vector sizes N = 2**16 block_x = 256 @@ -2941,8 +3153,6 @@ def _(): v.check_correctness(function.name, before=before, after=after) def _test_gpu_vec_add_boundary(self, N, splits, test_name): - from accera import Array, Nest, Package, ScalarType, Target - A = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N, )) B = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N, )) C = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(N, )) @@ -3003,8 +3213,6 @@ def _(): v.check_correctness(function.name, before=before, after=after) def _test_cpu_vec_add_boundary(self, N, splits, test_name): - from accera import Array, Nest, Package, ScalarType, Target - A = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N, )) B = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N, )) C = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(N, )) @@ -3063,9 +3271,6 @@ def test_gpu_vec_add_gpu_boundary_2_splits_cpuonly(self): self._test_cpu_vec_add_boundary(1280, [512, 64], inspect.currentframe().f_code.co_name) def _add_cuda_copy_kernel(self, package, N, block_x, block_y, target, basename="cuda_copy_kernel"): - from accera import Array, Nest, ScalarType - from accera._lang_python._lang import _MemorySpace - In = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N, N)) Out = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(N, N)) @@ -3098,8 +3303,6 @@ def _(): return function def test_cuda_module_output(self) -> None: - from accera import Package, Target - N = 2048 block_x = 16 block_y = block_x @@ -3128,8 +3331,6 @@ def test_cuda_module_output(self) -> None: v.check_correctness(function.name, before=(Input_test, Output_test), after=(Input_ref, Output_ref)) def test_cuda_multiple_funcs(self) -> None: - from accera import Package, Target - Ns = [1024, 2048] block_x = 16 block_y = block_x @@ 
-3161,9 +3362,6 @@ def test_cuda_multiple_funcs(self) -> None: v.check_correctness(function.name, before=(Input_test, Output_test), after=(Input_ref, Output_ref)) def _add_rocm_copy_kernel(self, package, N, block_x, block_y, target, basename="rocm_copy_kernel"): - from accera import Array, Nest, ScalarType - from accera._lang_python._lang import _MemorySpace - In = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N, N)) Out = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(N, N)) @@ -3197,8 +3395,6 @@ def _(): return function def test_rocm_module_output(self) -> None: - from accera import Package, Target - # Define our vector sizes N = 32 block_x = 16 @@ -3228,8 +3424,6 @@ def test_rocm_module_output(self) -> None: v.check_correctness(function.name, before=before, after=after) def test_rocm_multiple_funcs(self) -> None: - from accera import Package, Target - Ns = [1024, 2048] block_x = 16 block_y = block_x @@ -3272,8 +3466,6 @@ def _gpu_cache( double_buffer=False, double_buffer_location=Constants.AUTO ) -> None: - from accera import Array, Nest, Package, ScalarType, Target - A = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(M, K), layout=Array.Layout.FIRST_MAJOR) B = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(K, N), layout=Array.Layout.FIRST_MAJOR) C = Array( @@ -3343,9 +3535,6 @@ def test_gpu_cache_double_buffering(self) -> None: self._gpu_cache(2560, 1536, 2048, 16, 16, 32, "test_gpu_cache_double_buffering", True) def test_gpu_cache_double_buffering_trigger_index(self) -> None: - from accera import Array, Nest, Package, ScalarType, Target - from accera._lang_python._lang import _MemorySpace - M = 2560 N = 1536 K = 2048 @@ -3426,8 +3615,6 @@ def test_gpu_cache_double_buffering_mem_space(self) -> None: ) def test_cpu_cache_double_buffering_trigger_index(self) -> None: - from accera import Array, Nest, Package, ScalarType - M = 1024 N = 1024 K = 1024 @@ -3511,7 +3698,6 @@ def _(): # TODO : move vpmaddwd tests to a different test file def test_signextend_int16_matmul_vpmaddwd(self): - from accera import AllocateFlags, create_dimensions test_name = "test_signextend_int16_matmul_vpmaddwd" def inout_array(arr: Array): @@ -4251,10 +4437,10 @@ def test_f32_horizontal_vector_add_1_row(self): test_name = "test_f32_horizontal_vector_add_1_row" N = 8 - A = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N,), layout=Array.Layout.FIRST_MAJOR) + A = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N, ), layout=Array.Layout.FIRST_MAJOR) B = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(1, ), layout=Array.Layout.FIRST_MAJOR) - nest = Nest(shape=(N,)) + nest = Nest(shape=(N, )) i, = nest.get_indices() @nest.iteration_logic @@ -4268,8 +4454,8 @@ def _(): package = Package() function = package.add(plan, args=(A, B), base_name=test_name) - A_test = np.random.random((N,)).astype(np.float32) - B_test = np.random.random((1,)).astype(np.float32) + A_test = np.random.random((N, )).astype(np.float32) + B_test = np.random.random((1, )).astype(np.float32) B_ref = B_test.copy() for i in range(N): @@ -4300,10 +4486,10 @@ def test_i32_horizontal_vector_add_1_row(self): test_name = "test_i32_horizontal_vector_add_1_row" N = 8 - A = Array(role=Role.INPUT, element_type=ScalarType.int32, shape=(N,), layout=Array.Layout.FIRST_MAJOR) + A = Array(role=Role.INPUT, element_type=ScalarType.int32, shape=(N, ), layout=Array.Layout.FIRST_MAJOR) B = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.int32, 
shape=(1, ), layout=Array.Layout.FIRST_MAJOR) - nest = Nest(shape=(N,)) + nest = Nest(shape=(N, )) i, = nest.get_indices() @nest.iteration_logic @@ -4317,8 +4503,8 @@ def _(): package = Package() function = package.add(plan, args=(A, B), base_name=test_name) - A_test = np.random.random((N,)).astype(np.int32) - B_test = np.random.random((1,)).astype(np.int32) + A_test = np.random.random((N, )).astype(np.int32) + B_test = np.random.random((1, )).astype(np.int32) B_ref = B_test.copy() for i in range(N): @@ -4612,11 +4798,6 @@ def test_matmul_input_cache_element_type_uint_to_int(self) -> None: ) def test_gpu_barrier_opt(self) -> None: - from accera import Array, Nest, Package, ScalarType, Target - from accera._lang_python._lang import Allocate, _MemorySpace, Array as NativeArray - from accera._lang_python._lang._gpu import Barrier - from accera._lang_python import _MemoryLayout - N = 256 block_x = 16 @@ -4682,8 +4863,6 @@ def _(): v.check_correctness(function.name, before=before, after=after) def test_rocm_gemm_tiled_output(self) -> None: - from accera import Array, Nest, Package, ScalarType, Target - M = 16 N = M K = M @@ -5448,8 +5627,6 @@ def run_file_check(verifier): # import accera as acc # # TODO : update once MemorySpace is better surfaced - # from accera._lang_python._lang import _MemorySpace - # package = Package() # M = 32 @@ -5502,8 +5679,6 @@ def run_file_check(verifier): # import accera as acc # # TODO : update once MemorySpace is better surfaced - # from accera._lang_python._lang import _MemorySpace - # package = Package() # M = 32 @@ -5735,8 +5910,6 @@ def run_file_check(verifier): # self._verify_matrix_multiplication_function(function, package, f"test_thrifty_caching_elide_boundary_no_elide_main") def test_gpu_cache_different_input_layouts(self): - from accera import Array, Nest, Package, ScalarType, Target - from accera._lang_python._lang import _MemorySpace M = 2560 N = 1536 @@ -5858,9 +6031,6 @@ def test_gpu_cache_block_level_private_mem(self): # This test verifies that a private memory cache will compute a region specific to each thread # even when added at the block level of the loopnest - from accera import Array, Nest, Package, ScalarType, Target - from accera._lang_python._lang import _MemorySpace - M = 2560 N = 1536 K = 2048 @@ -5943,9 +6113,6 @@ def test_gpu_cache_block_level_shared_mem(self): # This test verifies that a shared memory cache will compute a region specific to each logical block # even when added outside the block level of the loopnest - from accera import Array, Nest, Package, ScalarType, Target - from accera._lang_python._lang import _MemorySpace - M = 2560 N = 1536 K = 2048 @@ -6030,9 +6197,6 @@ def test_gpu_cache_block_level_global_mem(self): # This test verifies that a global memory cache will compute a region specific to each logical block # even when added outside the block level of the loopnest - from accera import Array, Nest, Package, ScalarType, Target - from accera._lang_python._lang import _MemorySpace - M = 2560 N = 1536 K = 2048 @@ -6114,7 +6278,6 @@ def file_check_fn(verifier): ) def test_vectorized_and_unvectorized_cpu_caches(self): - from accera import AUTO M = 512 N = 512 S = 512 @@ -6165,8 +6328,6 @@ def _(): self._verify_matrix_multiplication_function(function, package, f"test_vectorized_and_unvectorized_cpu_caches") def test_rocm_cache_double_buffering__with_c_cache_tensorize(self) -> None: - from accera import Array, Nest, Package, ScalarType, Target - M = 1024 N = 1024 K = 1024 @@ -6247,8 +6408,6 @@ def _(): ) def 
test_rocm_c_cache_private(self) -> None: - from accera import Array, Nest, Package, ScalarType, Target - M = 1024 N = 1024 K = 1024 @@ -6311,9 +6470,6 @@ def _(): ) def test_fill_fp16(self): - from accera import Array, Nest, Package, ScalarType - from accera import cast - # Define our vector sizes N = 2**16 @@ -6348,9 +6504,6 @@ def fill_fp16(): v.check_correctness(function.name, before=(Output_test, ), after=(Output_ref, )) def test_abs_fp16(self): - from accera import Array, Nest, Package, ScalarType, Target - from accera import abs - # Define our vector sizes N = 16 @@ -6362,7 +6515,7 @@ def test_abs_fp16(self): @nest.iteration_logic def _(): - Out[i] = abs(In[i]) + Out[i] = accabs(In[i]) schedule = nest.create_schedule() plan = schedule.create_plan() @@ -6386,8 +6539,6 @@ def abs_fp16(a): v.check_correctness(function.name, before=(Input_test, Output_test), after=(Input_test, Output_ref)) def test_vec_add_fp16(self): - from accera import Array, Nest, Package, ScalarType - # Define our vector sizes N = 2**16 @@ -6430,8 +6581,6 @@ def vecadd_ref(a, b): ) def test_rocm_tensorize_fp16(self) -> None: - from accera import Array, Nest, Package, ScalarType, Target - M = 1024 N = 1024 K = 1024 @@ -6494,8 +6643,6 @@ def _(): ) def test_rocm_cache_double_buffering_tensorize_fp16(self) -> None: - from accera import Array, Nest, Package, ScalarType, Target - M = 1024 N = 1024 K = 1024 @@ -6575,8 +6722,6 @@ def _(): ) def test_rocm_double_buffer_small_cache_vectorized_unvectorized_tensorized(self) -> None: - from accera import Array, Nest, Package, ScalarType, Target - M = 512 N = 512 K = 512 @@ -6907,7 +7052,6 @@ def _(): ) def test_vectorized_masked_buffer_fill(self) -> None: - from accera._lang_python._lang import _If N_input = 5 N_output = 8 Input = Array(role=Role.INPUT, element_type=ScalarType.int32, shape=(N_input, )) @@ -6943,7 +7087,6 @@ def store_zero(): ) def test_vectorized_masked_store(self) -> None: - from accera._lang_python._lang import _If N_input = 8 N_output = 5 Input = Array(role=Role.INPUT, element_type=ScalarType.int32, shape=(N_input, )) @@ -6976,7 +7119,6 @@ def store_value(): ) def test_vectorized_masked_accumulate(self) -> None: - from accera._lang_python._lang import _If N_input = 8 N_output = 5 Input = Array(role=Role.INPUT, element_type=ScalarType.int32, shape=(N_input, )) @@ -7008,6 +7150,40 @@ def store_value(): output_dir=output_dir ) + def test_vectorized_masked_constant_scale_store(self) -> None: + from accera._lang_python._lang import _If + package_name = "test_vectorized_masked_constant_scale_store" + N_input = 8 + N_output = 5 + scale = 0.2 + Input = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(N_input, )) + Output = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(N_output, )) + package = Package() + nest = Nest(shape=(N_input, )) + i, = nest.get_indices() + + @nest.iteration_logic + def _nest(): + + def store_value(): + Output[i] = Input[i] * scale + + _If(i < N_output, store_value) + + sched = nest.create_schedule() + plan = sched.create_plan() + plan.vectorize(i) + fn = package.add(plan, args=(Input, Output), base_name=package_name) + output_dir = pathlib.Path(TEST_PACKAGE_DIR) / package_name + shutil.rmtree(output_dir, ignore_errors=True) + with verifiers.VerifyPackage(self, package_name, output_dir) as v: + package.build( + name=package_name, + format=self.PACKAGE_FORMAT | Package.Format.MLIR_VERBOSE, + mode=self.PACKAGE_MODE, + output_dir=output_dir + ) + def test_packing_floordiv_mod_no_splits(self) -> None: 
package_name = "test_packing_floordiv_mod_no_splits" M = 256 @@ -7107,6 +7283,83 @@ def _nest(): output_dir=output_dir ) + def test_dynamic_temp_array(self) -> None: + import accera as acc + test_name = "test_dynamic_temp_array" + + package = Package() + + M, N = acc.create_dimensions() + + A = acc.Array(role=acc.Role.INPUT, shape=(M, N), layout=acc.Array.Layout.FIRST_MAJOR) + B = acc.Array(role=acc.Role.TEMP, shape=(M, N), layout=acc.Array.Layout.FIRST_MAJOR) + C = acc.Array(role=acc.Role.INPUT_OUTPUT, shape=(N, M), layout=acc.Array.Layout.FIRST_MAJOR) + + nest = acc.Nest(shape=(M, N)) + i, j = nest.get_indices() + + @nest.iteration_logic + def _(): + B[i, j] = A[i, j] + C[j, i] = B[i, j] + + function = package.add(nest, args=(M, N, A, C), base_name=test_name) + + output_dir = pathlib.Path(TEST_PACKAGE_DIR) / test_name + with verifiers.VerifyPackage(self, test_name, output_dir) as v: + package.build( + name=test_name, + format=self.PACKAGE_FORMAT | Package.Format.MLIR, + mode=self.PACKAGE_MODE, + output_dir=output_dir + ) + + checker = v.file_checker(f"*_ConvertValueToStd.mlir") + checker.check_not('memref.global') + checker.check_not('memref.get_global') + checker.check('memref.alloc(%arg0, %arg1)') + checker.check('memref.dealloc') + checker.run() + + def test_static_temp_array(self) -> None: + import accera as acc + test_name = "test_static_temp_array" + + package = Package() + + M = 16 + N = 32 + + A = acc.Array(role=acc.Role.INPUT, shape=(M, N), layout=acc.Array.Layout.FIRST_MAJOR) + B = acc.Array(role=acc.Role.TEMP, shape=(M, N), layout=acc.Array.Layout.FIRST_MAJOR) + C = acc.Array(role=acc.Role.INPUT_OUTPUT, shape=(N, M), layout=acc.Array.Layout.FIRST_MAJOR) + + nest = acc.Nest(shape=(M, N)) + i, j = nest.get_indices() + + @nest.iteration_logic + def _(): + B[i, j] = A[i, j] + C[j, i] = B[i, j] + + function = package.add(nest, args=(A, C), base_name=test_name) + + output_dir = pathlib.Path(TEST_PACKAGE_DIR) / test_name + with verifiers.VerifyPackage(self, test_name, output_dir) as v: + package.build( + name=test_name, + format=self.PACKAGE_FORMAT | Package.Format.MLIR, + mode=self.PACKAGE_MODE, + output_dir=output_dir + ) + + checker = v.file_checker(f"*_ConvertValueToStd.mlir") + checker.check('memref.global') + checker.check('memref.get_global') + checker.check_not('memref.alloc') + checker.check_not('memref.dealloc') + checker.run() + if __name__ == '__main__': unittest.main(verbosity=10) diff --git a/accera/transforms/include/value/ValueToLLVMLoweringPass.h b/accera/transforms/include/value/ValueToLLVMLoweringPass.h index ffc6cd4c..62a73dc4 100644 --- a/accera/transforms/include/value/ValueToLLVMLoweringPass.h +++ b/accera/transforms/include/value/ValueToLLVMLoweringPass.h @@ -38,6 +38,7 @@ void populateValueToLLVMNonMemPatterns(mlir::LLVMTypeConverter& typeConverter, m void populateGlobalValueToLLVMNonMemPatterns(mlir::LLVMTypeConverter& typeConverter, mlir::RewritePatternSet& patterns); void populateLocalValueToLLVMNonMemPatterns(mlir::LLVMTypeConverter& typeConverter, mlir::RewritePatternSet& patterns); void populateValueToLLVMMemPatterns(mlir::LLVMTypeConverter& typeConverter, mlir::RewritePatternSet& patterns); +void populateReshapeOpToLLVMMemPatterns(mlir::LLVMTypeConverter& typeConverter, mlir::RewritePatternSet& patterns); const mlir::LowerToLLVMOptions& GetDefaultAcceraLLVMOptions(mlir::MLIRContext* context); std::unique_ptr> createValueToLLVMPass(mlir::LowerToLLVMOptions options); diff --git a/accera/transforms/include/value/ValueToStandardLoweringPass.h 
b/accera/transforms/include/value/ValueToStandardLoweringPass.h index f7682ec3..65bd4cbc 100644 --- a/accera/transforms/include/value/ValueToStandardLoweringPass.h +++ b/accera/transforms/include/value/ValueToStandardLoweringPass.h @@ -27,7 +27,7 @@ struct ProfileRegions; namespace accera::transforms::value { void populateVectorizeValueOpPatterns(mlir::RewritePatternSet& patterns); -void populateValueToStandardPatterns(bool enableProfiling, ProfileRegions& profileRegions, mlir::RewritePatternSet& patterns); +[[maybe_unused]] void populateValueToStandardPatterns(bool enableProfiling, ProfileRegions& profileRegions, mlir::RewritePatternSet& patterns); void populateValueLaunchFuncPatterns(mlir::RewritePatternSet& patterns); void populateValueModuleRewritePatterns(mlir::RewritePatternSet& patterns); diff --git a/accera/transforms/src/exec/ExecutionPlanToAffineLoweringPass.cpp b/accera/transforms/src/exec/ExecutionPlanToAffineLoweringPass.cpp index 21989a90..562bcc1c 100644 --- a/accera/transforms/src/exec/ExecutionPlanToAffineLoweringPass.cpp +++ b/accera/transforms/src/exec/ExecutionPlanToAffineLoweringPass.cpp @@ -744,7 +744,7 @@ std::vector GetMajorToMinorDimensionTraversal(const mlir::MemRefType& so llvm::SmallVector strides; int64_t offset; - auto strideResult = mlir::getStridesAndOffset(sourceType, strides, offset); + [[maybe_unused]] auto strideResult = mlir::getStridesAndOffset(sourceType, strides, offset); assert(succeeded(strideResult)); std::vector> strideAndLogicalDims; size_t dim = 0; @@ -1081,7 +1081,7 @@ struct MultiCacheInfo std::pair GetAccessValueAndIndices(Operation* loadOrStoreOp) { - bool isLoadOrStore = isa(loadOrStoreOp); + [[maybe_unused]] bool isLoadOrStore = isa(loadOrStoreOp); assert(isLoadOrStore); if (auto stdStoreOp = dyn_cast_or_null(loadOrStoreOp)) { @@ -1123,7 +1123,7 @@ std::pair GetAccessValueAndIndices(Operation* loa v::MMALoadSyncOp::Adaptor adaptor{ valueMMALoadSyncOp }; return std::make_pair(adaptor.memref(), adaptor.indices()); } - assert(false && "Unhandled load/store case"); + throw LogicException(LogicExceptionErrors::notImplemented, "Unhandled load/store case"); } bool ComputeRegionAccessedByOp(PatternRewriter& rewriter, mlir::MemRefRegion& activeBlockRegion, mlir::Operation* op, unsigned loopDepth, const std::unordered_map& handlesToKeepParametric = {}) @@ -1133,7 +1133,7 @@ bool ComputeRegionAccessedByOp(PatternRewriter& rewriter, mlir::MemRefRegion& ac if (isa(op)) { - auto result = ComputeMemrefRegion(activeBlockRegion, op, loopDepth, nullptr, false, handlesToKeepParametric); + [[maybe_unused]] auto result = ComputeMemrefRegion(activeBlockRegion, op, loopDepth, nullptr, false, handlesToKeepParametric); assert(succeeded(result)); return true; } @@ -1163,11 +1163,11 @@ bool IsCacheParametricOnGPUProc(mlir::Attribute cacheMemSpace, v::Processor gpuP gpuProc == v::Processor::BlockY || gpuProc == v::Processor::BlockZ; - bool isThread = gpuProc == v::Processor::ThreadX || - gpuProc == v::Processor::ThreadY || - gpuProc == v::Processor::ThreadZ || - gpuProc == v::Processor::WarpX || - gpuProc == v::Processor::WarpY; + [[maybe_unused]] bool isThread = gpuProc == v::Processor::ThreadX || + gpuProc == v::Processor::ThreadY || + gpuProc == v::Processor::ThreadZ || + gpuProc == v::Processor::WarpX || + gpuProc == v::Processor::WarpY; assert((isBlock || isThread) && "Loops that are bound to GPU proc handles other than block and thread handles are not well defined"); @@ -1277,7 +1277,7 @@ std::optional ComputeAccessInfoForArrayAtLevel(PatternRewriter& { // 
Validate that the held loop has the same gpuProc and gpuProcMap as the current one, otherwise throw auto heldForOp = mlir::getForInductionVarOwner(findIt->second); - auto&& [heldGpuProc, heldGpuProcMap] = getProcAndMap(heldForOp); + [[maybe_unused]] auto&& [heldGpuProc, heldGpuProcMap] = getProcAndMap(heldForOp); assert((heldGpuProc == gpuProc) && (heldGpuProcMap == gpuProcMap) && "Found duplicate index amongst for ops but they had different GPU mappings"); } else @@ -1408,7 +1408,7 @@ std::optional ComputeAccessInfoForArrayAtLevel(PatternRewriter& } else { - auto unionResult = result.activeBlock.unionBoundingBox(activeBlockRegion); + [[maybe_unused]] auto unionResult = result.activeBlock.unionBoundingBox(activeBlockRegion); assert(succeeded(unionResult)); result.activeBlock.cst.removeRedundantConstraints(); @@ -1749,6 +1749,7 @@ mlir::Value GetOriginalIV(mlir::Value possiblyOffsetIV) assert(false && "Offset IVs must be offset with AffineApplyOps and constants"); } } + return nullptr; } mlir::AffineMap ComputeLoopIVToDefinitionOrderMap(const std::vector& ivs, mlir::MLIRContext* context) @@ -1802,7 +1803,7 @@ mlir::AffineMap ComputeLoopIVToDefinitionOrderMap(const std::vector otherDefiningOp = otherIVBlockArg.getOwner()->getParentOp(); } bool currentIsAncestor = currentDefiningOp->isAncestor(otherDefiningOp); - bool otherIsAncestor = otherDefiningOp->isAncestor(currentDefiningOp); + [[maybe_unused]] bool otherIsAncestor = otherDefiningOp->isAncestor(currentDefiningOp); assert((currentIsAncestor || otherIsAncestor) && "ComputeLoopIVDefinitionOrder only works on nested AffineForOp IVs"); return currentIsAncestor; }); @@ -2400,7 +2401,7 @@ LogicalResult ActiveElementCacheCopyOpRewrite::matchAndRewrite(ActiveElementCach assert(dst.getType().isa()); auto dstMemRefType = dst.getType().cast(); const v::MemorySpace dstMemRefSpace{ dstMemRefType.getMemorySpaceAsInt() }; - auto baseDstElementType = GetInnerElementType(dst); // e.g. f32 + [[maybe_unused]] auto baseDstElementType = GetInnerElementType(dst); // e.g. f32 assert(baseSrcElementType == baseDstElementType && "Copy source and dest data types don't match"); @@ -2794,7 +2795,7 @@ LogicalResult ActiveBlockCacheCopyOpRewrite::matchAndRewrite(ActiveBlockCacheCop llvm::SmallVector outerArrayStrides; int64_t activeBlockOffset; // TODO : do we need to leverage this in any way? 
we're currently just arranging the threads according to fast/slow dimensions of the logical memref - auto strideResult = mlir::getStridesAndOffset(memRefType, outerArrayStrides, activeBlockOffset); + [[maybe_unused]] auto strideResult = mlir::getStridesAndOffset(memRefType, outerArrayStrides, activeBlockOffset); assert(succeeded(strideResult)); auto numOuterArrayMultiCacheDims = outerArrayStrides.size() - activeBlockRank; std::vector outerArrayActiveBlockStrides(outerArrayStrides.begin() + numOuterArrayMultiCacheDims, outerArrayStrides.end()); @@ -3803,6 +3804,7 @@ mlir::Value FindParentAffineForOpIV(mlir::Operation* op, const Index& loopnestIn currentParentForOp = currentParentForOp->getParentOfType(); } assert(false && "Given loopnest index does not correspond to a parent AffineForOp"); + return nullptr; } std::vector ResolveParentRelevantScheduleIndices(mlir::Operation* op, const mlir::ValueRange& baseRelevantScheduleIndices) @@ -4660,7 +4662,7 @@ LogicalResult MergeCacheRegionOpsRewrite::matchAndRewrite(BeginCreateCacheOp beg std::vector beginOpsForRemoval; std::vector endOpsForRemoval; - auto baseArray = beginCreateCacheOp.baseInput(); + [[maybe_unused]] auto baseArray = beginCreateCacheOp.baseInput(); // If the outermost loop in this cache region is used to index into the base array then the cache regions cannot be merged // as they will have different access patterns @@ -5135,7 +5137,7 @@ LogicalResult BeginCreateCacheOpRewrite::matchAndRewrite(BeginCreateCacheOp begi if (!cachesHaveSameShape) { - auto unionResult = matchingExistingInfoIter->arrayAccessInfo.activeBlock.unionBoundingBox(currentMultiCacheInfo.arrayAccessInfo.activeBlock); + [[maybe_unused]] auto unionResult = matchingExistingInfoIter->arrayAccessInfo.activeBlock.unionBoundingBox(currentMultiCacheInfo.arrayAccessInfo.activeBlock); assert(succeeded(unionResult)); matchingExistingInfoIter->arrayAccessInfo.activeBlock.cst.removeRedundantConstraints(); } @@ -6743,12 +6745,23 @@ LogicalResult HoistScalingToCacheReduceRewrite::matchAndRewrite(mlir::AffineStor Operation* targetCacheReduceOpOperation = nullptr; for (auto& cacheReduceOp : activeBlockCacheReduceOps) { - auto cacheReduceBlock = cacheReduceOp->getBlock(); - auto ancestorOp = cacheReduceBlock->findAncestorOpInBlock(*affineStoreOp.getOperation()); - if (ancestorOp) + // If the cache reduce op is still inside of an un-expanded loopnest, then wait for that loopnest to expand before attempting to hoist scales + if (cacheReduceOp->getParentOfType()) { - assert(targetCacheReduceOpOperation == nullptr); // Only expect one cache reduce op to be a candidate - targetCacheReduceOpOperation = cacheReduceOp; + return failure(); + } + + // Cache reduce ops are inside of lambdas to control for GPU block/thread shifting of the cache + // So find the parent LambdaOp and check that LambdaOp's parent block + if (auto parentLambdaOp = cacheReduceOp->getParentOfType()) + { + auto parentBlock = parentLambdaOp->getBlock(); + auto ancestorOp = parentBlock->findAncestorOpInBlock(*affineStoreOp.getOperation()); + if (ancestorOp) + { + assert(targetCacheReduceOpOperation == nullptr); // Only expect one cache reduce op to be a candidate + targetCacheReduceOpOperation = cacheReduceOp; + } } } for (auto& cacheReduceOp : activeElementCacheReduceOps) diff --git a/accera/transforms/src/gpu/ConvertLaunchFuncToVulkanCalls.cpp b/accera/transforms/src/gpu/ConvertLaunchFuncToVulkanCalls.cpp index 159c5a9e..0fbc008b 100644 --- a/accera/transforms/src/gpu/ConvertLaunchFuncToVulkanCalls.cpp +++ 
b/accera/transforms/src/gpu/ConvertLaunchFuncToVulkanCalls.cpp @@ -56,8 +56,8 @@ static constexpr const char* kVulkanLaunch = "vulkanLaunch"; // Custom to acc-vulkan-runtime-wrappers static constexpr const char* kSetRepeatedRunCharacteristics = "setRepeatedRunCharacteristics"; static constexpr const char* kVulkanRuntimeInstanceHandle = "VulkanRuntime_Instance_Handle"; -static constexpr const char* kInitVulkanEmittedFunc = "initVulkanUtilities"; -static constexpr const char* kDestroyVulkanEmittedFunc = "destroyVulkanUtilities"; +[[maybe_unused]] static constexpr const char* kInitVulkanEmittedFunc = "initVulkanUtilities"; +[[maybe_unused]] static constexpr const char* kDestroyVulkanEmittedFunc = "destroyVulkanUtilities"; static constexpr const char* kInitializeFuncAttrName = "rc_gpu_init"; static constexpr const char* kDeInitializeFuncAttrName = "rc_gpu_deinit"; static constexpr const char* kVulkanRuntimeHandleAccessor = "getVulkanRuntimeInstance"; diff --git a/accera/transforms/src/nest/LoopNestToValueFunc.cpp b/accera/transforms/src/nest/LoopNestToValueFunc.cpp index bb258ad5..779f8e52 100644 --- a/accera/transforms/src/nest/LoopNestToValueFunc.cpp +++ b/accera/transforms/src/nest/LoopNestToValueFunc.cpp @@ -221,16 +221,16 @@ struct LoopNestToValueFuncPass : public accera::transforms::LoopNestToValueFuncB { RewritePatternSet patterns(context); - xptr::populateExecutionPlanScaleHoistingPatterns(patterns); + utilir::FillCanonicalPatternsRecursively(vFuncOp, patterns); (void)applyPatternsAndFoldGreedily(vFuncOp, std::move(patterns)); - snapshotter.Snapshot("ExecutionPlanScaleHoisting", vFuncOp); + snapshotter.Snapshot("Canonicalize", vFuncOp); } { RewritePatternSet patterns(context); - utilir::FillCanonicalPatternsRecursively(vFuncOp, patterns); + xptr::populateExecutionPlanScaleHoistingPatterns(patterns); (void)applyPatternsAndFoldGreedily(vFuncOp, std::move(patterns)); - snapshotter.Snapshot("Canonicalize", vFuncOp); + snapshotter.Snapshot("ExecutionPlanScaleHoisting", vFuncOp); } { diff --git a/accera/transforms/src/util/RangeValueUtilities.cpp b/accera/transforms/src/util/RangeValueUtilities.cpp index b328e1f8..d8b25b89 100644 --- a/accera/transforms/src/util/RangeValueUtilities.cpp +++ b/accera/transforms/src/util/RangeValueUtilities.cpp @@ -390,11 +390,10 @@ RangeValue RangeValueAnalysis::resolveRangeValue(AffineApplyOp op) llvmBinOp = Instruction::BinaryOps::SDiv; break; case mlir::AffineExprKind::CeilDiv: - assert(false); // Unsupported currently - no matching llvm bin op - break; + // Unsupported currently - no matching llvm bin op + throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, "CeilDiv is not implemented"); default: - assert(false); - break; + throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, "Unsupported binary op expression"); } llvm::SmallVector operandRanges{ lhsRv, rhsRv }; auto rv = resolveRangeValue(llvmBinOp, operandRanges); diff --git a/accera/transforms/src/value/ValueFuncToTargetPass.cpp b/accera/transforms/src/value/ValueFuncToTargetPass.cpp index 1933dd3a..5c079a1a 100644 --- a/accera/transforms/src/value/ValueFuncToTargetPass.cpp +++ b/accera/transforms/src/value/ValueFuncToTargetPass.cpp @@ -93,9 +93,6 @@ void mapValueTypeAttr(OpT& op, mlir::BlockAndValueMapping& mapping) }); } -constexpr auto kDefaultExecutionTarget = vir::ExecutionTarget::CPU; -constexpr size_t kLaunchConfigNumDims = 6; - struct ValueFuncToTargetPass : public tr::ValueFuncToTargetBase { ValueFuncToTargetPass(const 
tr::IntraPassSnapshotOptions& options = {}) : diff --git a/accera/transforms/src/value/ValueToLLVMLoweringPass.cpp b/accera/transforms/src/value/ValueToLLVMLoweringPass.cpp index 4f5e4025..f163bb4b 100644 --- a/accera/transforms/src/value/ValueToLLVMLoweringPass.cpp +++ b/accera/transforms/src/value/ValueToLLVMLoweringPass.cpp @@ -677,7 +677,7 @@ struct VpmaddwdOpLowering : public ValueLLVMOpConversionPattern LLVMTypeConverter llvmTypeConverter(rewriter.getContext()); auto outputVecType = op.getType().cast(); auto outputVecLLVMType = llvmTypeConverter.convertType(outputVecType); - auto outputRank = outputVecType.getRank(); + [[maybe_unused]] auto outputRank = outputVecType.getRank(); assert(outputRank == 1 && "Vpmaddwd op should have a 1-D result"); auto elementCount = outputVecType.getShape()[0]; auto avx512Support = util::ModuleSupportsTargetDeviceFeature(op, "avx512"); @@ -1129,6 +1129,221 @@ struct RawPointerAPIUnusedUndefRemoval : public OpRewritePattern } }; +static OpFoldResult getExpandedDimSize( + OpBuilder &builder, + Location loc, + Type &llvmIndexType, + int64_t outDimIndex, ArrayRef outStaticShape, + MemRefDescriptor &inDesc, + ArrayRef inStaticShape, + ArrayRef reassocation, + DenseMap &outDimToInDimMap) +{ + int64_t outDimSize = outStaticShape[outDimIndex]; + if (!ShapedType::isDynamic(outDimSize)) + { + return builder.getIndexAttr(outDimSize); + } + + // Calculate the multiplication of all the out dim sizes except the current dim. + int64_t inDimIndex = outDimToInDimMap[outDimIndex]; + int64_t otherDimSizesMul = 1; + + for (auto otherDimIndex : reassocation[inDimIndex]) + { + if (otherDimIndex == static_cast(outDimIndex)) + { + continue; + } + otherDimSizesMul *= outStaticShape[otherDimIndex]; + } + + // outDimSize = inDimSize / otherOutDimSizesMul + int64_t inDimSize = inStaticShape[inDimIndex]; + Value inDimSizeDynamic = + ShapedType::isDynamic(inDimSize) + ? inDesc.size(builder, loc, inDimIndex) + : builder.create(loc, llvmIndexType, builder.getIndexAttr(inDimSize)); + + Value outDimSizeDynamic = builder.create( + loc, + inDimSizeDynamic, + builder.create(loc, llvmIndexType, builder.getIndexAttr(otherDimSizesMul))); + + return outDimSizeDynamic; +} + +// Compute a map that for a given dimension of the expanded type gives the +// dimension in the collapsed type it maps to. Essentially its the inverse of the `reassocation` maps. +static DenseMap getExpandedDimToOriginalDimMap(ArrayRef reassociation) +{ + llvm::DenseMap dimMap; + for (auto &dimArray : enumerate(reassociation)) + { + for (auto dim : dimArray.value()) + { + dimMap[dim] = dimArray.index(); + } + } + return dimMap; +} + +static SmallVector getExpandedShape( + OpBuilder &builder, + Location loc, + Type &llvmIndexType, + ArrayRef reassociation, + ArrayRef inStaticShape, + MemRefDescriptor &inDesc, + ArrayRef outStaticShape) +{ + DenseMap outDimToInDimMap = getExpandedDimToOriginalDimMap(reassociation); + return llvm::to_vector<4>(llvm::map_range( + llvm::seq(0, outStaticShape.size()), [&](int64_t outDimIndex) { + return getExpandedDimSize( + builder, + loc, + llvmIndexType, + outDimIndex, + outStaticShape, + inDesc, + inStaticShape, + reassociation, + outDimToInDimMap); + })); +} + +/// Helper function to convert a vector of `OpFoldResult`s into a vector of `Value`s. 
+static SmallVector getResultAsValues( + OpBuilder &builder, + Location loc, + Type &llvmIndexType, + ArrayRef valueOrAttrVec) +{ + return llvm::to_vector<4>( + llvm::map_range(valueOrAttrVec, [&](OpFoldResult value) -> Value { + if (auto attr = value.dyn_cast()) + { + return builder.create(loc, llvmIndexType, attr); + } + return value.get(); + })); +} + +static SmallVector getDynamicExpandedShape( + OpBuilder &builder, + Location loc, + Type &llvmIndexType, + ArrayRef reassociation, + ArrayRef inStaticShape, + MemRefDescriptor &inDesc, + ArrayRef outStaticShape) +{ + return getResultAsValues( + builder, + loc, + llvmIndexType, + getExpandedShape( + builder, + loc, + llvmIndexType, + reassociation, + inStaticShape, + inDesc, + outStaticShape)); +} + +bool isStrideOrOffsetStatic(int64_t strideOrOffset) +{ + return !ShapedType::isDynamicStrideOrOffset(strideOrOffset); +} + +struct ExpandShapeOpLowering : public ConvertOpToLLVMPattern +{ +public: + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + using ReshapeOpAdaptor = typename memref::ExpandShapeOp::Adaptor; + + LogicalResult matchAndRewrite( + memref::ExpandShapeOp reshapeOp, + ReshapeOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override + { + MemRefType dstType = reshapeOp.getResultType(); + MemRefType srcType = reshapeOp.getSrcType(); + + int64_t offset; + SmallVector strides; + if (failed(getStridesAndOffset(dstType, strides, offset))) + { + return rewriter.notifyMatchFailure(reshapeOp, "failed to get stride and offset exprs"); + } + + MemRefDescriptor srcDesc(adaptor.src()); + Location loc = reshapeOp->getLoc(); + auto dstDesc = MemRefDescriptor::undef(rewriter, loc, this->typeConverter->convertType(dstType)); + + dstDesc.setAllocatedPtr(rewriter, loc, srcDesc.allocatedPtr(rewriter, loc)); + dstDesc.setAlignedPtr(rewriter, loc, srcDesc.alignedPtr(rewriter, loc)); + dstDesc.setOffset(rewriter, loc, srcDesc.offset(rewriter, loc)); + + ArrayRef srcStaticShape = srcType.getShape(); + ArrayRef dstStaticShape = dstType.getShape(); + Type llvmIndexType = this->typeConverter->convertType(rewriter.getIndexType()); + + SmallVector dstShape = getDynamicExpandedShape( + rewriter, + loc, + llvmIndexType, + reshapeOp.getReassociationIndices(), + srcStaticShape, + srcDesc, + dstStaticShape); + + for (auto &shape : llvm::enumerate(dstShape)) + { + dstDesc.setSize(rewriter, loc, shape.index(), shape.value()); + } + + if (llvm::all_of(strides, isStrideOrOffsetStatic)) + { + for (auto &stride : llvm::enumerate(strides)) + { + dstDesc.setConstantStride(rewriter, loc, stride.index(), stride.value()); + } + } + else if (srcType.getLayout().isIdentity() && dstType.getLayout().isIdentity()) + { + Value stride = rewriter.create(loc, llvmIndexType, rewriter.getIndexAttr(1)); + for (auto dimIndex : llvm::reverse(llvm::seq(0, dstShape.size()))) + { + dstDesc.setStride(rewriter, loc, dimIndex, stride); + stride = rewriter.create(loc, dstShape[dimIndex], stride); + } + } + else + { + // There could be mixed static/dynamic strides. For simplicity, we + // recompute all strides if there is at least one dynamic stride. + // See comments for computeExpandedLayoutMap in llvm source code + // for details on how the strides are calculated. 
+ for (auto &dimArray : llvm::enumerate(reshapeOp.getReassociationIndices())) + { + auto currentStrideToExpand = srcDesc.stride(rewriter, loc, dimArray.index()); + for (auto dstIndex : llvm::reverse(dimArray.value())) + { + dstDesc.setStride(rewriter, loc, dstIndex, currentStrideToExpand); + Value size = dstDesc.size(rewriter, loc, dstIndex); + currentStrideToExpand = rewriter.create(loc, size, currentStrideToExpand); + } + } + + } + rewriter.replaceOp(reshapeOp, {dstDesc}); + return success(); + } +}; + } // namespace using namespace accera::transforms::value; @@ -1569,6 +1784,7 @@ void ValueToLLVMLoweringPass::runOnModule() RewritePatternSet patterns(&getContext()); populateValueToLLVMMemPatterns(llvmTypeConverter, patterns); + populateReshapeOpToLLVMMemPatterns(llvmTypeConverter, patterns); populateMathToLLVMConversionPatterns(llvmTypeConverter, patterns); populateMemRefToLLVMConversionPatterns(llvmTypeConverter, patterns); populateStdToLLVMConversionPatterns(llvmTypeConverter, patterns); @@ -1662,6 +1878,11 @@ void populateValueToLLVMMemPatterns(mlir::LLVMTypeConverter& typeConverter, mlir patterns.insert(typeConverter, context); } +void populateReshapeOpToLLVMMemPatterns(mlir::LLVMTypeConverter& typeConverter, mlir::RewritePatternSet& patterns) +{ + patterns.insert(typeConverter); +} + const mlir::LowerToLLVMOptions& GetDefaultAcceraLLVMOptions(mlir::MLIRContext* context) { static LowerToLLVMOptions options(context); // statically allocated default we hand out copies to diff --git a/accera/transforms/src/value/ValueToStandardLoweringPass.cpp b/accera/transforms/src/value/ValueToStandardLoweringPass.cpp index 40312d54..c8d8f0a8 100644 --- a/accera/transforms/src/value/ValueToStandardLoweringPass.cpp +++ b/accera/transforms/src/value/ValueToStandardLoweringPass.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -469,6 +470,46 @@ struct PrintProfileResultsOpLowering : public OpRewritePattern { + mlir::OpBuilder::InsertPoint GetPreferredAllocInsertionPoint(ValueAllocOp op) const + { + auto operands = op.getOperation()->getOperands(); + // Find the operand which is dominated by all the other operands and the parentFuncOp + // This is therefore the last / deepest operand and the alloc should occur after it + auto parentFuncOp = op->getParentOfType(); + DominanceInfo domInfo(parentFuncOp); + mlir::Operation* currentLeastDominantOperation = parentFuncOp; + bool insertInsideOperandBlock = true; + for (auto operand : operands) + { + mlir::Operation* currentOp; + bool currentOpIsBlockArg = operand.isa(); + if (currentOpIsBlockArg) + { + auto blockArg = operand.cast(); + currentOp = blockArg.getOwner()->getParentOp(); + } + else + { + currentOp = operand.getDefiningOp(); + } + if (domInfo.dominates(currentLeastDominantOperation, currentOp)) + { + currentLeastDominantOperation = currentOp; + insertInsideOperandBlock = currentOpIsBlockArg; + } + } + if (insertInsideOperandBlock) + { + auto& block = currentLeastDominantOperation->getRegion(0).front(); + return mlir::OpBuilder::InsertPoint(&block, block.begin()); + } + else + { + auto block = currentLeastDominantOperation->getBlock(); + return mlir::OpBuilder::InsertPoint(block, ++mlir::Block::iterator(currentLeastDominantOperation)); + } + } + using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(ValueAllocOp op, PatternRewriter& rewriter) const final @@ -483,34 +524,31 @@ struct AllocOpLowering : public OpRewritePattern auto memrefType = op.getType(); auto allocType = 
op.allocType().getValueOr(vir::MemoryAllocType::Global); + if (memrefType.getNumDynamicDims() != 0) + { + // dynamic allocations must all be heap allocations + allocType = vir::MemoryAllocType::Heap; + } OpBuilder::InsertionGuard guard(rewriter); - auto parentFuncOp = op->getParentOfType(); + auto insertPoint = GetPreferredAllocInsertionPoint(op); + mlir::memref::AllocOp allocOp; mlir::Block* parentBlock; mlir::Value allocatedMemref; switch (allocType) { case vir::MemoryAllocType::Global: { - if (memrefType.getNumDynamicDims() == 0) - { - auto globalOp = irutil::CreateGlobalBufferOp(rewriter, op, MemRefType::Builder{ memrefType }.setLayout({}), kGlobalOpSymNameFormat); - rewriter.replaceOpWithNewOp(op, memrefType, globalOp.sym_name()); - } - else - { - rewriter.replaceOpWithNewOp(op, memrefType, op.getOperation()->getOperands(), op.alignmentAttr()); - } + auto globalOp = irutil::CreateGlobalBufferOp(rewriter, op, MemRefType::Builder{ memrefType }.setLayout({}), kGlobalOpSymNameFormat); + rewriter.replaceOpWithNewOp(op, memrefType, globalOp.sym_name()); } break; case vir::MemoryAllocType::Stack: - // Create the stack allocation at the beginning of the function - rewriter.setInsertionPointToStart(&parentFuncOp.front()); + rewriter.restoreInsertionPoint(insertPoint); rewriter.replaceOpWithNewOp(op, MemRefType::Builder{ memrefType }.setLayout({}), mlir::ValueRange{}, op.alignmentAttr()); break; case vir::MemoryAllocType::Heap: - // Create the heap allocation at the beginning of the function - rewriter.setInsertionPointToStart(&parentFuncOp.front()); + rewriter.restoreInsertionPoint(insertPoint); allocOp = rewriter.replaceOpWithNewOp(op, memrefType, op.getOperation()->getOperands(), op.alignmentAttr()); // Create a dealloc op at the end of the block containing this alloc op @@ -563,8 +601,8 @@ struct CastOpLowering : public OpRewritePattern auto fromType = op.source().getType(); auto toType = op.result().getType(); - auto isFromTypeVector = fromType.isa(); - auto isToTypeVector = toType.isa(); + [[maybe_unused]] auto isFromTypeVector = fromType.isa(); + [[maybe_unused]] auto isToTypeVector = toType.isa(); assert(isFromTypeVector == isToTypeVector && "Can only cast vectors to vectors or scalars to scalars"); auto fromElementType = util::GetElementType(fromType); @@ -2520,7 +2558,7 @@ LogicalResult EnterProfileRegionOpLowering::matchAndRewrite(EnterProfileRegionOp auto millisecondsInSecond = rewriter.create(loc, util::GetValAttr(rewriter, currentTime.getType(), 1000)); mlir::Value newCurrentTime = rewriter.create(loc, vir::BinaryOpPredicate::MUL, currentTime, millisecondsInSecond); - + rewriter.create(loc, newCurrentTime, startTimeRef); rewriter.eraseOp(op); return success(); @@ -2728,7 +2766,7 @@ LogicalResult VhaddLowering::matchAndRewrite(ValueVHADDOp op, PatternRewriter& r auto rhs = op.rhs(); auto vecType = lhs.getType().cast(); - auto rank = vecType.getRank(); + [[maybe_unused]] auto rank = vecType.getRank(); assert(rank == 1 && "vhadd only supports rank-1 vectors"); auto elementType = vecType.getElementType(); auto elementCount = vecType.getNumElements(); @@ -2911,8 +2949,8 @@ void populateValueToStandardPatterns(bool enableProfiling, ProfileRegions& profi VhaddLowering>(context); patterns.insert(context, enableProfiling, profileRegions); + PrintProfileResultsOpLowering, + ExitProfileRegionOpLowering>(context, enableProfiling, profileRegions); } std::unique_ptr> createValueToStdPass(bool enableProfiling) diff --git a/accera/transforms/src/vectorization/VectorizationUtil.cpp 
b/accera/transforms/src/vectorization/VectorizationUtil.cpp index 51adc344..9976702d 100644 --- a/accera/transforms/src/vectorization/VectorizationUtil.cpp +++ b/accera/transforms/src/vectorization/VectorizationUtil.cpp @@ -2885,7 +2885,7 @@ mlir::LogicalResult vectorizeInt16MatMul(mlir::AffineForOp affineForOp, // 6. add C + (A * B) if (innerLoopBodyIter == innerLoopBodyEnd || !isa(*innerLoopBodyIter)) { - return reportMatchFailure(affineForOp, "Failed to match the binary add op"); + return reportMatchFailure(affineForOp, "Failed to match the binary op"); } auto accOp = cast(*innerLoopBodyIter++); if (accOp.predicate() != v::BinaryOpPredicate::ADD) @@ -3194,7 +3194,7 @@ mlir::LogicalResult vectorizeMaskedLoadStore(mlir::AffineForOp loopOp, loadVal = thenCastOp.result(); } - // match second load op for accumulation case + // match second load op for binop case mlir::AffineLoadOp loadOp2; mlir::Value loadVal2; @@ -3216,32 +3216,42 @@ mlir::LogicalResult vectorizeMaskedLoadStore(mlir::AffineForOp loopOp, loadVal2 = thenCastOp2.result(); } - // binary add op for accumulation case - v::BinOp accOp; + // optional binary op + v::BinOp binOp; if (thenOpsIter != thenOpsEnd && isa(thenOpsIter)) { - accOp = cast(thenOpsIter++); + binOp = cast(thenOpsIter++); } - // Check that the operands for the accumulation op are in fact the values from load ops - mlir::Value accVal; - if (accOp) + // Check that the operands for the bin op are in fact the values from load ops + mlir::Value binOpVal; + if (binOp) { - if (!((accOp.lhs() == loadVal && accOp.rhs() == loadVal2) || (accOp.rhs() == loadVal && accOp.lhs() == loadVal2))) + if (!loadVal2) { - return reportMatchFailure(accOp, "Failed to match the accumulation operands"); + // Check if one of the operands to the bin op is a constant + auto otherVal = binOp.lhs() == loadVal ? 
binOp.rhs() : binOp.lhs(); + if (otherVal.getDefiningOp()) + { + loadVal2 = otherVal; + } } - matchedOps.push(accOp); - accVal = accOp.getResult(); + + if (!((binOp.lhs() == loadVal && binOp.rhs() == loadVal2) || (binOp.rhs() == loadVal && binOp.lhs() == loadVal2))) + { + return reportMatchFailure(binOp, "Failed to match the binOp operands"); + } + matchedOps.push(binOp); + binOpVal = binOp.getResult(); } - // optionally check if there is a cast op after accumulation op + // optionally check if there is a cast op after bin op v::CastOp thenCastOp3; if (thenOpsIter != thenOpsEnd && isa(thenOpsIter)) { thenCastOp3 = cast(thenOpsIter++); matchedOps.push(thenCastOp3); - accVal = thenCastOp3.result(); + binOpVal = thenCastOp3.result(); } // store op @@ -3390,21 +3400,37 @@ mlir::LogicalResult vectorizeMaskedLoadStore(mlir::AffineForOp loopOp, std::vector indices2(adaptor2.indices().begin(), adaptor2.indices().end()); auto [flatCastMemref2, flattenedPosition2] = FlattenAccess(rewriter, loadOp2, indices2); - mlir::Value accumulateOperand2 = rewriter.create(loadLoc2, loadVectorType2, flatCastMemref2, mlir::ValueRange{ flattenedPosition2 }, permutationMap, finalPaddingOpValue2, mask, inbounds); + mlir::Value binOperand2 = rewriter.create(loadLoc2, loadVectorType2, flatCastMemref2, mlir::ValueRange{ flattenedPosition2 }, permutationMap, finalPaddingOpValue2, mask, inbounds); // optional cast op after second vector transfer read op if (thenCastOp2) // then cast op { // Create a cast to hold vector of values auto castVecType2 = mlir::VectorType::get({ unrollMax }, thenCastOp2.getType()); - accumulateOperand2 = rewriter.create(loopOp.getLoc(), accumulateOperand2, castVecType2); + binOperand2 = rewriter.create(loopOp.getLoc(), binOperand2, castVecType2); } - // if there is a second masked load, accumulation operator must follow before final store - // create binary add op to accumulate results from first and second masked load ops - valueToStore = rewriter.create(accOp.getLoc(), v::BinaryOpPredicate::ADD, valueToStore, accumulateOperand2); + // if there is a second masked load, bin operator must follow before final store + // create binary op to combine results from first and second masked load ops + valueToStore = rewriter.create(binOp.getLoc(), binOp.getPredicate(), valueToStore, binOperand2); + + // optional cast op after bin op + if (thenCastOp3) // then cast op + { + // Create a cast to hold vector of values + auto castVecType3 = mlir::VectorType::get({ unrollMax }, thenCastOp3.getType()); + valueToStore = rewriter.create(loopOp.getLoc(), valueToStore, castVecType3); + } + } + else if (loadVal2 && loadVal2.getDefiningOp()) + { + auto constantOp = loadVal2.getDefiningOp(); + // If the second operand to the binop is a constant, broadcast to a vector and replicate the bin op + auto vectorType = mlir::VectorType::get({ unrollMax }, constantOp.getType()); + auto broadcastConstantOp = rewriter.create(constantOp.getLoc(), vectorType, constantOp); + valueToStore = rewriter.create(binOp.getLoc(), binOp.getPredicate(), valueToStore, broadcastConstantOp); - // optional cast op after accumulation op + // optional cast op after bin op if (thenCastOp3) // then cast op { // Create a cast to hold vector of values diff --git a/accera/utilities/test/src/MemoryLayout_test.cpp b/accera/utilities/test/src/MemoryLayout_test.cpp index c197edae..b125f9f0 100644 --- a/accera/utilities/test/src/MemoryLayout_test.cpp +++ b/accera/utilities/test/src/MemoryLayout_test.cpp @@ -92,7 +92,9 @@ TEST_CASE("TestMemoryLayoutCtors") 
TEST_CASE("TestMemoryLayoutSlice") { constexpr int64_t rows = 3, columns = 5, channels = 7, outerExtent = 4; - auto physicalSize = GENERATE_COPY(chunk(GENERATE(range(1, 4)), values({ rows, columns, channels, outerExtent }))); + // BUGBUG: causes infinite loop in Catch2 3.3.1 + // auto physicalSize = GENERATE_COPY(chunk(GENERATE(range(1, 4)), values({ rows, columns, channels, outerExtent }))); + auto physicalSize = GENERATE_COPY(chunk(2, values({ rows, columns, channels, outerExtent }))); std::vector order(physicalSize.size()); std::iota(order.begin(), order.end(), 0); @@ -107,17 +109,10 @@ TEST_CASE("TestMemoryLayoutSlice") auto sliced = layout.GetSliceLayout(sliceDimension); CHECK(sliced.NumDimensions() == (layout.NumDimensions() - 1)); - CHECKED_IF(sliceDimension == 0) - { - CHECK(sliced.NumElements() == (layout.NumElements() / layout.GetExtent(0))); - } - CHECKED_ELSE(sliceDimension == 0) - { - CHECK(sliced.NumElements() == (layout.NumElements() / layout.GetExtent(0))); - } + CHECK(sliced.NumElements() == (layout.NumElements() / layout.GetExtent(sliceDimension))); auto slicedNumDimensions = sliced.NumDimensions(); - CHECKED_ELSE(slicedNumDimensions == 0) + CHECKED_IF(slicedNumDimensions == 0) { auto dimension = GENERATE_COPY(range(zero, slicedNumDimensions)); CHECKED_IF(dimension < sliceDimension) diff --git a/accera/value/include/Scalar.h b/accera/value/include/Scalar.h index ad57277b..5498b602 100644 --- a/accera/value/include/Scalar.h +++ b/accera/value/include/Scalar.h @@ -46,7 +46,7 @@ namespace value Scalar(Scalar&&) noexcept; Scalar& operator=(const Scalar&); Scalar& operator=(Scalar&&) noexcept; - ~Scalar(); + virtual ~Scalar(); /// Gets the underlying wrapped Value instance Value GetValue() const; diff --git a/accera/value/include/ScalarDimension.h b/accera/value/include/ScalarDimension.h index de8a225e..14db7bfa 100644 --- a/accera/value/include/ScalarDimension.h +++ b/accera/value/include/ScalarDimension.h @@ -17,6 +17,7 @@ namespace value ScalarDimension(Role role = Role::Input); ScalarDimension(const std::string& name, Role role = Role::Input); ScalarDimension(Value value, const std::string& name = "", Role role = Role::Input); + ~ScalarDimension(); virtual void SetValue(Value value) final; }; diff --git a/accera/value/src/MLIREmitterContext.cpp b/accera/value/src/MLIREmitterContext.cpp index 8e6a15bc..ad3a338a 100644 --- a/accera/value/src/MLIREmitterContext.cpp +++ b/accera/value/src/MLIREmitterContext.cpp @@ -1014,7 +1014,7 @@ static accera::ir::value::MemoryAllocType AllocateFlagToAllocateType(accera::val MAP_FLAGS(Heap, Heap); // MAP_FLAGS(ThreadLocal, ThreadLocal); // Not implemented default: - assert(false); + llvm_unreachable("Unknown allocation flag"); } #undef MAP_PREDICATE @@ -1172,7 +1172,7 @@ EmitterContext::DefinedFunction MLIRContext::CreateFunctionImpl(FunctionDeclarat auto isPublic = decl.IsPublic(); auto funcTarget = decl.Target(); auto funcRuntime = decl.Runtime(); - auto isGpu = std::holds_alternative(funcTarget); + [[maybe_unused]] auto isGpu = std::holds_alternative(funcTarget); const auto& argTypes = decl.GetParameterTypes(); const auto& returnType = decl.GetReturnType(); @@ -1977,7 +1977,7 @@ Value MLIRContext::ViewImpl(Value sourceValue, const std::vector& offset std::vector sizes; std::vector strides; auto convertValueToMLIRIndexValue = [&](int64_t sentinelValue) { - return [&](Scalar scalarValue) -> mlir::Value { + return [&, sentinelValue](Scalar scalarValue) -> mlir::Value { auto mlirVal = ToMLIRValue(builder, scalarValue); if (auto 
constantIndex = mlirVal.getDefiningOp()) { @@ -2102,6 +2102,18 @@ Value MLIRContext::ReinterpretCastImpl(Value input, ValueType valueType) auto outputElementBitwidth = outputMlirElemType.getIntOrFloatBitWidth(); auto outputElementBytewidth = outputElementBitwidth / 8; + // Special case where the input element type and target element type have the same bitwidth + // Then the layout doesn't change + if (inputElementBytewidth == outputElementBytewidth) + { + mlir::MemRefType::Builder outputTypeBuilder(inputMemrefType); + outputTypeBuilder.setElementType(outputMlirElemType); + mlir::MemRefType outputMemRefType = outputTypeBuilder; + auto returnVal = builder.create(loc, inputMlir, outputMemRefType); + + return Wrap(returnVal, input.GetLayout()); + } + auto d0 = mlir::getAffineDimExpr(0, mlirCtx); auto d1 = mlir::getAffineDimExpr(1, mlirCtx); diff --git a/accera/value/src/ScalarDimension.cpp b/accera/value/src/ScalarDimension.cpp index 6a6bbdb5..5236f3ec 100644 --- a/accera/value/src/ScalarDimension.cpp +++ b/accera/value/src/ScalarDimension.cpp @@ -23,5 +23,8 @@ namespace value { Scalar::SetValue(value); } + + ScalarDimension::~ScalarDimension() = default; + } // namespace value } // namespace accera \ No newline at end of file
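
---

The `getExpandedDimToOriginalDimMap` helper introduced above inverts the reassociation grouping so that each expanded dimension can look up the collapsed dimension it came from. Below is a minimal standalone sketch of that inversion using plain standard containers rather than the MLIR types; the names are illustrative only.

```
// Standalone sketch (not the MLIR implementation): invert a reassociation
// grouping so each expanded dimension maps back to its collapsed dimension.
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

// reassociation[i] lists the expanded dims that fold into collapsed dim i,
// e.g. expanding a 2-D shape into 3-D with the grouping {{0, 1}, {2}}.
std::unordered_map<int64_t, int64_t>
ExpandedDimToCollapsedDim(const std::vector<std::vector<int64_t>>& reassociation)
{
    std::unordered_map<int64_t, int64_t> dimMap;
    for (size_t collapsedDim = 0; collapsedDim < reassociation.size(); ++collapsedDim)
    {
        for (int64_t expandedDim : reassociation[collapsedDim])
        {
            dimMap[expandedDim] = static_cast<int64_t>(collapsedDim);
        }
    }
    return dimMap;
}

int main()
{
    auto dimMap = ExpandedDimToCollapsedDim({ { 0, 1 }, { 2 } });
    for (const auto& [expanded, collapsed] : dimMap)
    {
        std::cout << "expanded dim " << expanded << " -> collapsed dim " << collapsed << "\n";
    }
}
```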
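The dynamic branch of `getExpandedDimSize` computes a runtime extent as the collapsed dimension's size divided by the product of the statically-known sibling extents in the same reassociation group. A simplified sketch of that arithmetic follows, assuming a `-1` sentinel for dynamic extents; the helper name is not from the codebase.

```
// Standalone sketch of the dynamic-dimension arithmetic used when expanding a
// shape: the dynamic output dim equals the input dim divided by the product of
// the static sibling output dims in its group.
#include <cstdint>
#include <iostream>
#include <vector>

constexpr int64_t kDynamic = -1; // illustrative sentinel for a dynamic extent

int64_t ExpandedDimSize(int64_t inputDimSize,
                        const std::vector<int64_t>& groupOutputSizes,
                        size_t dynamicIndexInGroup)
{
    int64_t otherSizesProduct = 1;
    for (size_t i = 0; i < groupOutputSizes.size(); ++i)
    {
        if (i != dynamicIndexInGroup)
        {
            otherSizesProduct *= groupOutputSizes[i];
        }
    }
    // outDimSize = inDimSize / product(other static out dims in the group)
    return inputDimSize / otherSizesProduct;
}

int main()
{
    // Expanding a runtime extent of 48 into (?, 16): the dynamic dim is 48 / 16 = 3.
    std::vector<int64_t> group = { kDynamic, 16 };
    std::cout << ExpandedDimSize(48, group, 0) << "\n"; // prints 3
}
```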
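For identity layouts, the `ExpandShapeOpLowering` pattern rebuilds strides by walking the expanded sizes from the innermost dimension outward and accumulating a running product. A plain C++ sketch of that row-major stride recomputation:

```
// Standalone sketch of the identity-layout stride recomputation: walk the
// expanded sizes from the innermost dimension outward, multiplying as you go.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> RowMajorStrides(const std::vector<int64_t>& sizes)
{
    std::vector<int64_t> strides(sizes.size());
    int64_t stride = 1;
    for (size_t i = sizes.size(); i-- > 0;)
    {
        strides[i] = stride;
        stride *= sizes[i];
    }
    return strides;
}

int main()
{
    for (int64_t s : RowMajorStrides({ 3, 16, 4 })) // expect 64 4 1
        std::cout << s << " ";
    std::cout << "\n";
}
```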
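`AllocOpLowering` now forces any buffer with dynamic dimensions onto the heap, regardless of the requested allocation type, so that the matching dealloc can release it at the end of the enclosing block. The sketch below captures just the decision rule; the enum and the `-1` sentinel are loosely modeled on the real types for illustration.

```
// Standalone sketch of the "dynamic sizes force heap allocation" rule: buffers
// whose extents are only known at runtime cannot be emitted as fixed-size
// globals or stack slots, so they fall back to heap allocation.
#include <cstdint>
#include <iostream>
#include <vector>

enum class AllocKind { Global, Stack, Heap };

constexpr int64_t kDynamic = -1; // illustrative sentinel for a dynamic extent

AllocKind ChooseAllocKind(const std::vector<int64_t>& shape, AllocKind requested)
{
    for (int64_t extent : shape)
    {
        if (extent == kDynamic)
        {
            return AllocKind::Heap; // dynamic allocations must all be heap allocations
        }
    }
    return requested;
}

int main()
{
    std::cout << static_cast<int>(ChooseAllocKind({ 4, 16 }, AllocKind::Stack)) << "\n";         // stays Stack (1)
    std::cout << static_cast<int>(ChooseAllocKind({ kDynamic, 16 }, AllocKind::Global)) << "\n"; // forced to Heap (2)
}
```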
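The masked load/store vectorizer now accepts a constant as the second bin-op operand by splatting it across the vector width and applying the op lane by lane before the final masked store. A scalar-loop sketch of that broadcast-then-apply behavior; the function name is made up for illustration and stands in for the `vector.broadcast` plus `BinOp` sequence.

```
// Standalone sketch of the constant-operand case in the masked vector binop:
// the scalar constant is broadcast across the vector width and the binary op
// is applied lane by lane against the loaded vector.
#include <functional>
#include <iostream>
#include <vector>

std::vector<float> ApplyWithBroadcastConstant(const std::vector<float>& loaded,
                                              float constant,
                                              const std::function<float(float, float)>& binOp)
{
    std::vector<float> broadcast(loaded.size(), constant); // splat the constant
    std::vector<float> result(loaded.size());
    for (size_t lane = 0; lane < loaded.size(); ++lane)
    {
        result[lane] = binOp(loaded[lane], broadcast[lane]);
    }
    return result;
}

int main()
{
    auto out = ApplyWithBroadcastConstant({ 1.f, 2.f, 3.f, 4.f }, 0.5f, std::multiplies<float>());
    for (float v : out)
        std::cout << v << " "; // prints 0.5 1 1.5 2
    std::cout << "\n";
}
```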
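Making `Scalar`'s destructor virtual matters because `ScalarDimension` derives from it and may be destroyed through a base pointer. A minimal sketch of the general C++ rule, using hypothetical `Base`/`Derived` types rather than the Accera classes:

```
// Standalone sketch of why a polymorphic base class needs a virtual destructor:
// deleting a derived object through a base pointer is only well-defined when
// the base destructor is virtual, and only then does derived cleanup run.
#include <iostream>
#include <memory>

struct Base
{
    virtual ~Base() { std::cout << "~Base\n"; } // virtual so derived cleanup runs
};

struct Derived : Base
{
    ~Derived() override { std::cout << "~Derived\n"; }
};

int main()
{
    std::unique_ptr<Base> p = std::make_unique<Derived>();
    p.reset(); // prints "~Derived" then "~Base"; without `virtual`, behavior is undefined
}
```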
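`ReinterpretCastImpl` short-circuits when the source and destination element types share a bitwidth: the memref layout is reused and only the element type is swapped. The sketch below illustrates why same-width reinterpretation leaves extents and strides untouched, using `memcpy` as the portable way to express the bit-level reinterpretation; it is an analogy, not the memref lowering itself.

```
// Standalone sketch of the same-bitwidth reinterpret-cast special case: when
// source and destination element types have the same width, element count,
// extents, and strides are unchanged and only the element type is reinterpreted.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

int main()
{
    static_assert(sizeof(float) == sizeof(int32_t), "same bitwidth required");

    std::vector<int32_t> raw = { 0x3F800000, 0x40000000 }; // bit patterns of 1.0f and 2.0f
    std::vector<float> reinterpreted(raw.size());
    // Same byte count, same layout; only the element type changes.
    std::memcpy(reinterpreted.data(), raw.data(), raw.size() * sizeof(int32_t));

    for (float f : reinterpreted)
        std::cout << f << " "; // prints 1 2
    std::cout << "\n";
}
```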