Squashed commit of the following:
commit 18db5586d1a45f65fd98ea1a21d5fb87db5d2dbf
Author: Lisa Ong <[email protected]>
Date:   Thu Mar 16 03:46:54 2023 +0000

    Merged PR 3160: [security] bump onnx to 1.13.0

    This resolves a high-severity Dependabot alert.
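
    A quick sanity check after upgrading (a minimal sketch; only the version pin comes from this PR):

    ```
    import onnx

    # requirements.txt below pins onnx==1.13.0; fail fast if an older wheel is installed
    assert onnx.__version__ == "1.13.0", f"expected onnx 1.13.0, found {onnx.__version__}"
    ```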

commit 07d16bf787bffa3be93dd7902a402e7e5e660596
Author: Mason Remy <[email protected]>
Date:   Thu Mar 16 02:17:51 2023 +0000

    Merged PR 3157: Dynamic split dim tests

    Dynamic split dim tests
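
    For intuition, a NumPy analogue of what the tests added in this PR exercise (an assumption based on the tests' own reference check, which reshapes the flat input): `_split_dimension(0, N)` splits a 1-D array of length M*N into an (M, N) view.

    ```
    import numpy as np

    M, N = 64, 16
    flat = np.arange(M * N, dtype=np.float32)   # dynamically sized 1-D input
    split = flat.reshape(M * N // N, N)         # analogue of Input._split_dimension(0, N)
    assert split[3, 5] == flat[3 * N + 5]       # row-major element correspondence
    ```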

commit 7c5b9a18adbba2ec10461118fb061365e34f5ed0
Author: Denny Sun <[email protected]>
Date:   Wed Mar 15 01:47:45 2023 +0000

    Merged PR 3158: Do not unroll the profiling ops when vectorization enabled

    When vectorization is enabled, the ops in the kernel get unrolled. For example, without this fix the timer added to the inner kernel ends up with 8 copies, which is clearly wrong.
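
    A plain-Python sketch of the issue (hypothetical names; this is not Accera IR): with an unroll factor of 8, every op inside the unrolled body is replicated 8 times, so a timer placed inside would fire 8 times per logical region. The fix keeps profiling ops out of the unrolled region.

    ```
    import time

    UNROLL = 8

    def profiled_kernel(data):
        start = time.perf_counter()         # enter-region op: must execute once, not 8x
        for i in range(0, len(data), UNROLL):
            for u in range(UNROLL):         # stands in for the unrolled/vectorized body
                data[i + u] *= 2.0          # profiling ops must not be replicated here
        return time.perf_counter() - start  # exit-region op: must execute once, not 8x

    print(f"{profiled_kernel([1.0] * 1024):.6f} s")
    ```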

commit df217f2e731c2609674da57662eaf1ed6b4a40b0
Author: Denny Sun <[email protected]>
Date:   Mon Mar 13 06:18:41 2023 +0000

    Merged PR 3153: Fix the lowering issue of the profiling ops

    With this fix, kernel-level profiling works end to end. Here is an example of how to use it:

    ```
            @tile_nest.iteration_logic
            def _tile_logic():
                # Bracket each region of interest with Enter/ExitProfileRegion
                EnterProfileRegion("pack_b_fn_outer")
                pack_b_fn(B, B_temp, j, k)
                ExitProfileRegion("pack_b_fn_outer")

                EnterProfileRegion("matmul_fn_outer")
                matmul_fn(A, B, C, B_temp, i, j, k)
                ExitProfileRegion("matmul_fn_outer")

                # Emit the timings accumulated so far for all regions
                PrintProfileResults()
    ```

    The timings printed out look like the following (each row shows the region name, the invocation count, and what appears to be the cumulative elapsed time):

    ```
    matmul_fn_outer 1       0.000100 ms
    pack_b_fn_outer 1       0.000400 ms
    matmul_fn_outer 2       0.000400 ms
    pack_b_fn_outer 2       0.001200 ms
    matmul_fn_outer 3       0.000600 ms
    pack_b_fn_outer 3       0.001700 ms
    matmul_fn_outer 4       0.000800 ms
    pack_b_fn_outer 4       0.002300 ms
    matmul_fn_outer 5       0.000900 ms
    pack_b_fn_outer 5       0.002700 ms
    matmul_fn_outer 6       0.001200 ms
    pack_b_fn_outer 6       0.003200 ms
    matmul_fn_outer 7       0.001500 ms
    pack_b_fn_outer 7       0.003700 ms
    matmul_fn_outer 8       0.001700 ms
    pack_b_fn_outer 8       0.004000 ms
    matmul_fn_outer 9       0.002000 ms
    pack_b_fn_outer 9       0.004500 ms
    matmul_fn_outer 10      0.002200 ms
    pack_b_fn_outer 10      0.004800 ms
    matmul_fn_outer 11      0.002400 ms
    pack_b_fn_outer 11      0.005300 ms
    matmul_fn_outer 12      0.002700 ms
    pack_b_fn_outer 12      0.006500 ms
    matmul_fn_outer 13      0.003100 ms
    pack_b_fn_outer 13      0.007400 ms
    matmul_fn_outer 14      0.003400 ms
    pack_b_fn_outer 14      0.007800 ms
    matmul_fn_outer 15      0.003700 ms
    pack_b_fn_outer 15      0.008300 ms
    matmul_fn_outer 16      0.004000 ms
    pack_b_fn_outer 16      0.008800 ms
    matmul_fn_outer 17      0.004400 ms
    pack_b_fn_outer 17      0.009199 ms
    matmul_fn_outer 18      0.004800 ms
    pack_b_fn_outer 18      0.009599 ms
    matmul_fn_outer 19      0.005100 ms
    pack_b_fn_outer 19      0.010099 ms
    matmul_fn_outer 20      0.005400 ms
    pack_b_fn_outer 20      0.010599 ms
    matmul_fn_outer 21      0.006000 ms
    pack_b_fn_outer 21      0.011299 ms
    matmul_fn_outer 22      0.006300 ms
    pack_b_fn_outer 22      0.011899 ms
    matmul_fn_outer 23      0.006500 ms
    pack_b_fn_outer 23      0.012299 ms
    matmul_fn_outer 24      0.006701 ms
    pack_b_fn_outer 24      0.012699 ms
    matmul_fn_outer 25      0.006901 ms
    pack_b_fn_outer 25      0.013099 ms
    matmul_fn_outer 26      0.007101 ms
    pack_b_fn_outer 26      0.013399 ms
    matmul_fn_outer 27      0.007300 ms
    pack_b_fn_outer 27      0.013799 ms
    matmul_fn_outer 28      0.007401 ms
    pack_b_fn_outer 28      0.014100 ms
    matmul_fn_outer 29      0.007601 ms
    pack_b_fn_outer 29      0.014600 ms
    matmul_fn_outer 30      0.007801 ms
    pack_b_fn_outer 30      0.015000 ms
    matmul_fn_outer 31      0.007901 ms
    pack_b_fn_outer 31      0.015399 ms
    matmul_fn_outer 32      0.008101 ms
    pack_b_fn_outer 32      0.015699 ms
    matmul_fn_outer 33      0.008301 ms
    pack_b_fn_outer 33      0.015999 ms
    matmul_fn_outer 34      0.008601 ms
    pack_b_fn_outer 34      0.016...
    ```

commit 3572c2b081198e1631f2df208c07490c6d4b4bf5
Author: Lisa Ong <[email protected]>
Date:   Fri Mar 10 10:57:39 2023 +0000

    Merged PR 3152: [nfc] [test] Skip fast_exp mlas tests on unsupported Aarch64

    These tests generate `llvm.x86.avx.max.ps.256`, which is not supported on non-Intel processors such as the Apple M1:

    ```
      %28 = load <8 x float>, <8 x float>* %27, align 4, !dbg !19
      %29 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %28, <8 x float> <float 0xC0561814A0000000, float 0xC0561814A0000000, float 0xC0561814A0000000, float 0xC0561814A0000000, float 0xC0561814A0000000, float 0xC0561814A0000000, float 0xC0561814A0000000, float 0xC0561814A0000000>), !dbg !20
      %30 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %29, <8 x float> <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>, <8 x float> <float 0x4168000000000000, float 0x4168000000000000, float 0x4168000000000000, float 0x4168000000000000, float 0x4168000000000000, float 0x4168000000000000, float 0x4168000000000000, float 0x4168000000000000>), !dbg !21
      %31 = fsub <8 x float> %30, <float 0x4168000000000000, float 0x4168000000000000, float 0x4168000000000000, float 0x4168000000000000, float 0x4168000000000000, float 0x4168000000000000, float 0x4168000000000000, float 0x4168000000000000>, !dbg !22

    ```
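
    The guard pattern, sketched with the standard library (the actual tests use Accera's `expectedFailure` helper, as the smoke_tests.py diff below shows):

    ```
    import platform
    import sys
    import unittest

    # True on Apple Silicon Macs, where the x86 AVX intrinsic cannot be lowered
    IS_MACOS_ARM64 = sys.platform == "darwin" and platform.machine() == "arm64"

    class FastExpTests(unittest.TestCase):
        @unittest.skipIf(IS_MACOS_ARM64, "avx2 instructions not supported on macOS arm64")
        def test_fast_exp_mlas(self):
            ...  # elided; see the real tests in smoke_tests.py
    ```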
Lisa Ong committed Mar 16, 2023
1 parent 7ac5fe4 commit f55e4d8
Showing 8 changed files with 261 additions and 93 deletions.
2 changes: 1 addition & 1 deletion accera/onnx-emitter/test/requirements.txt
@@ -1,4 +1,4 @@
-onnx==1.9.0
+onnx==1.13.0
onnxruntime==1.7.0
protobuf==3.20.2 # tied to the version of onnx above
sympy
1 change: 0 additions & 1 deletion accera/python/accera/test/dsl_tests.py
@@ -6413,7 +6413,6 @@ def _verify_func(
after=correctness_check_values["post"],
)

-@expectedFailure(FailedReason.NOT_IN_CORE, "Fail to lower to llvm")
def test_profiling_nested_function_calls(self):
test_name = "test_profiling_nested_function_calls"

95 changes: 95 additions & 0 deletions accera/python/accera/test/smoke_tests.py
@@ -664,9 +664,17 @@ def _():

v.check_correctness(function.name, before=(In_test, Out_test), after=(In_test, Out_ref))

@expectedFailure(
FailedReason.INVALID, "avx2 instructions not supported on MacOS arm64", sys.platform == "darwin"
and platform.machine() == "arm64"
)
def test_fast_exp_mlas_w_func_level_precision(self):
self._test_fast_exp_mlas(True)

@expectedFailure(
FailedReason.INVALID, "avx2 instructions not supported on MacOS arm64", sys.platform == "darwin"
and platform.machine() == "arm64"
)
def test_fast_exp_mlas_w_pkg_level_precision(self):
self._test_fast_exp_mlas(False)

@@ -1808,6 +1816,93 @@ def packed_index(i_outer, i_middle, i_inner, j_outer, j_middle, j_inner, tile_of
)] = test_input[i_outer + i_middle + i_inner, j_outer + j_middle + j_inner]
v.check_correctness(function.name, before=(test_input, test_output), after=(test_input, test_output_ref))

@expectedFailure(FailedReason.BUG, "_split_dimension of a dynamically sized dimension with a dynamic size is not working")
def test_dynamic_split_dim_dynamic_size(self) -> None:
from accera import create_dimensions
test_name = "test_dynamic_split_dim_dynamic_size"

M, N, MN = create_dimensions()

package = Package()

Input = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(MN,))
Output = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(M, N))

nest = Nest(shape=(M, N))
i, j = nest.get_indices()

@nest.iteration_logic
def _():
split_input = Input._split_dimension(0, N)
Output[i, j] = split_input[i, j]

fn = package.add(
nest,
args=(MN, M, N, Input, Output),
base_name=f"{test_name}_fn"
)

output_dir = pathlib.Path(TEST_PACKAGE_DIR) / test_name
shutil.rmtree(output_dir, ignore_errors=True)

with verifiers.VerifyPackage(self, test_name, output_dir) as v:
package.build(
name=test_name, format=self.PACKAGE_FORMAT, mode=self.PACKAGE_MODE, output_dir=output_dir, _quiet=False
)

# correctness check
test_M = 64
test_N = 16
test_MN = test_M*test_N
test_input = np.random.random([test_M*test_N]).astype(np.float32)
test_output = np.random.random([test_M, test_N]).astype(np.float32)
test_output_ref = test_input.copy().reshape((test_M, test_N))
v.check_correctness(fn.name, before=(test_MN, test_M, test_N, test_input, test_output), after=(test_MN, test_M, test_N, test_input, test_output_ref))

@expectedFailure(FailedReason.BUG, "_split_dimension of a dynamically sized dimension with a static size is not working")
def test_dynamic_split_dim_static_size(self) -> None:
from accera import create_dimensions
test_name = "test_dynamic_split_dim_static_size"

M, MN = create_dimensions()
N = 16

package = Package()

Input = Array(role=Role.INPUT, element_type=ScalarType.float32, shape=(MN,))
Output = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(M, N))

nest = Nest(shape=(M, N))
i, j = nest.get_indices()

@nest.iteration_logic
def _():
split_input = Input._split_dimension(0, cast(16, ScalarType.index))
Output[i, j] = split_input[i, j]

fn = package.add(
nest,
args=(MN, M, Input, Output),
base_name=f"{test_name}_fn"
)

output_dir = pathlib.Path(TEST_PACKAGE_DIR) / test_name
shutil.rmtree(output_dir, ignore_errors=True)

with verifiers.VerifyPackage(self, test_name, output_dir) as v:
package.build(
name=test_name, format=self.PACKAGE_FORMAT, mode=self.PACKAGE_MODE, output_dir=output_dir, _quiet=False
)

# correctness check
test_M = 64
test_N = N
test_MN = test_M*test_N
test_input = np.random.random([test_M*test_N]).astype(np.float32)
test_output = np.random.random([test_M, test_N]).astype(np.float32)
test_output_ref = test_input.copy().reshape((test_M, test_N))
v.check_correctness(fn.name, before=(test_MN, test_M, test_input, test_output), after=(test_MN, test_M, test_input, test_output_ref))

def test_padded_nchwc_conv2d_manual_cache(self) -> None:
input_channels = 64
base_input_shape = (input_channels, 28, 28) # CHW order
@@ -19,10 +19,15 @@ class OperationPass;
class RewritePatternSet;
} // namespace mlir

namespace
{
struct ProfileRegions;
}

namespace accera::transforms::value
{
void populateVectorizeValueOpPatterns(mlir::RewritePatternSet& patterns);
void populateValueToStandardPatterns(bool enableProfiling, mlir::RewritePatternSet& patterns);
void populateValueToStandardPatterns(bool enableProfiling, ProfileRegions& profileRegions, mlir::RewritePatternSet& patterns);
void populateValueLaunchFuncPatterns(mlir::RewritePatternSet& patterns);
void populateValueModuleRewritePatterns(mlir::RewritePatternSet& patterns);

