From b020a93a109c8d5fdc97321983f142ca73a8a51d Mon Sep 17 00:00:00 2001
From: Muhammad Asif Manzoor <mmanzoor@tenstorrent.com>
Date: Mon, 24 Feb 2025 13:15:56 -0500
Subject: [PATCH] Add support for stablehlo.reduce op for logical or operator
 (#2160)

TTNN does not support reduction for the logical or operator, so
stablehlo.reduce with a stablehlo.or body is decomposed into a reduction
sum op along the given dimension(s). If the ttnn.sum output is zero then the
reduce_or output is false; otherwise the output is true.

### Ticket
https://github.com/tenstorrent/tt-mlir/issues/1143

### Problem description
Add support for reduction operation for logical or operator

### What's changed
- `ttir.reduce_or` op is added in TTIR dialect
- `ttir.reduce_or` op is decomposed/converted to `ttir.sum` op as
tt-metal does not support reduction or operation.
- Stablehlo conversion for reduce or op.

### Checklist
- [X] New tests provide coverage for changes
---
 include/ttmlir/Dialect/TTIR/IR/TTIROps.td     | 22 ++++++++++
 .../StableHLOToTTIRPatterns.cpp               | 17 ++++++--
 .../TTIRToTTIRDecomposition.cpp               | 25 +++++++++++
 .../TTIRToTTIRDecompositionPass.cpp           |  1 +
 lib/Dialect/TTIR/IR/TTIROps.cpp               | 16 +++++++
 .../reduction/reduce_or_op.mlir               | 39 +++++++++++++++++
 .../TTIR/reduction/reduce_or.mlir             | 41 ++++++++++++++++++
 .../TTNN/reduction/simple_reduce_or.mlir      | 39 +++++++++++++++++
 .../n150/reduction/reduce_or_op.mlir          | 19 ++++++++
 .../Silicon/TTNN/n150/simple_reduce_or.mlir   | 43 +++++++++++++++++++
 10 files changed, 259 insertions(+), 3 deletions(-)
 create mode 100644 test/ttmlir/Conversion/StableHLOToTTIR/reduction/reduce_or_op.mlir
 create mode 100644 test/ttmlir/Decomposition/TTIR/reduction/reduce_or.mlir
 create mode 100644 test/ttmlir/Dialect/TTNN/reduction/simple_reduce_or.mlir
 create mode 100644 test/ttmlir/Silicon/StableHLO/n150/reduction/reduce_or_op.mlir
 create mode 100644 test/ttmlir/Silicon/TTNN/n150/simple_reduce_or.mlir

diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td
index 8f83056edb..bfcc7a5ce3 100644
--- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td
+++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td
@@ -806,6 +806,28 @@ def TTIR_ReduceAndOp : TTIR_ReductionOp<"reduce_and"> {
     }];
 }
 
+def TTIR_ReduceOrOp : TTIR_ReductionOp<"reduce_or"> {
+    let summary = "Or reduction op.";
+    let description = [{
+      Reduces a given tensor using logical or operator along the given dimension(s).
+
+      Example:
+        input: [[True,  False, False, False],
+                [True,  True,  False, True],
+                [False, False, False, True],
+                [False, False, False, False]]
+
+        // Reduction along dim 0
+        output: [True, True, False, True]
+
+        // Reduction along dim 1
+        output: [True, True, True, False]
+
+        // Reduction for both dimensions (entire tensor)
+        output: [True]
+    }];
+}
+
 def TTIR_ProdOp : TTIR_ReductionOp<"prod"> {
   let summary = "Product reduction op.";
   let description = [{
diff --git a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp
index 334e228303..d3518a3bc4 100644
--- a/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp
+++ b/lib/Conversion/StableHLOToTTIR/StableHLOToTTIRPatterns.cpp
@@ -99,6 +99,10 @@ class StableHLOToTTIRReduceOpConversionPattern
       return matchAndRewriteInternal<mlir::tt::ttir::ReduceAndOp>(
           srcOp, adaptor, rewriter);
     }
+    if (mlir::isa<mlir::stablehlo::OrOp>(innerOp)) {
+      return matchAndRewriteInternal<mlir::tt::ttir::ReduceOrOp>(srcOp, adaptor,
+                                                                 rewriter);
+    }
     if (isArgMax(srcOp, adaptor, rewriter)) {
       return matchAndRewriteInternalArgMax(srcOp, adaptor, rewriter);
     }
@@ -129,16 +133,23 @@ class StableHLOToTTIRReduceOpConversionPattern
     }
 
     mlir::Operation &innerOp = srcOp.getBody().front().front();
-    if (mlir::isa<mlir::stablehlo::AndOp>(innerOp)) {
+    if (mlir::isa<mlir::stablehlo::AndOp>(innerOp) ||
+        mlir::isa<mlir::stablehlo::OrOp>(innerOp)) {
       bool allOperandsAreBoolean = std::all_of(
           srcOp->operand_begin(), srcOp->operand_end(), [](auto operand) {
             return mlir::cast<RankedTensorType>(operand.getType())
                        .getElementTypeBitWidth() == 1;
           });
+      // StableHLO (unlike other dialects) has a single op for both logical and
+      // bitwise operations; the element type distinguishes them. If the
+      // element type is boolean it is a logical operation; otherwise it is a
+      // bitwise operation. This check ensures that the inputs are boolean, as
+      // tt-metal only supports logical operations.
       if (!allOperandsAreBoolean) {
         return rewriter.notifyMatchFailure(
-            srcOp, "stablehlo.reduce for stablehlo.and operator is only "
-                   "supported for logical and.");
+            srcOp,
+            "stablehlo.reduce for stablehlo.and/stablehlo.or operator is only "
+            "supported for logical operator.");
       }
     }
 
diff --git a/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp b/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp
index fe9a5cb865..d2249affb9 100644
--- a/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp
+++ b/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecomposition.cpp
@@ -1344,6 +1344,30 @@ struct ArgMaxOpKeepDimConversionPattern
 };
 } // namespace
 
+// TTNN has no logical-or reduction, so ttir.reduce_or is rewritten to ttir.sum
+// over the same dims: a zero sum means false, any non-zero sum means true.
+// NOTE(review): the sum is not normalized back to 0/1 by this pattern;
+// presumably consumers only test the result against zero - confirm.
+namespace {
+struct ReductionOrPattern : public OpConversionPattern<ttir::ReduceOrOp> {
+public:
+  using OpConversionPattern<ttir::ReduceOrOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(ttir::ReduceOrOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    RankedTensorType reduceOutputType = mlir::cast<RankedTensorType>(
+        getTypeConverter()->convertType(op.getResult().getType()));
+
+    ttmlir::utils::replaceOpWithNewDPSOp<ttir::SumOp>(
+        rewriter, op, reduceOutputType, adaptor.getInput(), op.getKeepDim(),
+        op.getDimArgAttr());
+
+    return success();
+  }
+};
+} // namespace
+
 void populateTTIRToTTIRDecompositionPatterns(MLIRContext *ctx,
                                              RewritePatternSet &patterns,
                                              TypeConverter &typeConverter) {
@@ -1356,6 +1380,7 @@ void populateTTIRToTTIRDecompositionPatterns(MLIRContext *ctx,
   patterns.add<ArangeForceLastDimensionPattern>(typeConverter, ctx);
   patterns.add<DotGeneralToMatmulConversionPattern>(typeConverter, ctx);
   patterns.add<ReductionAndPattern>(typeConverter, ctx);
+  patterns.add<ReductionOrPattern>(typeConverter, ctx);
   patterns.add<ArgMaxOpKeepDimConversionPattern>(typeConverter, ctx);
 }
 
diff --git a/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecompositionPass.cpp b/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecompositionPass.cpp
index 0ef642bbed..59befd3192 100644
--- a/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecompositionPass.cpp
+++ b/lib/Conversion/TTIRToTTIRDecomposition/TTIRToTTIRDecompositionPass.cpp
@@ -54,6 +54,7 @@ struct TTIRToTTIRDecompositionPass
     target.addIllegalOp<ttir::SelectOp>();
     target.addIllegalOp<ttir::DotGeneralOp>();
     target.addIllegalOp<ttir::ReduceAndOp>();
+    target.addIllegalOp<ttir::ReduceOrOp>();
 
     // These are the ops that must satisfy some conditions after this pass
     target.addDynamicallyLegalOp<ttir::ArangeOp>([&](ttir::ArangeOp op) {
diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp
index 798af3c1cc..64087a92a6 100644
--- a/lib/Dialect/TTIR/IR/TTIROps.cpp
+++ b/lib/Dialect/TTIR/IR/TTIROps.cpp
@@ -2563,6 +2563,22 @@ ::mlir::LogicalResult mlir::tt::ttir::ReduceAndOp::verify() {
   return verifyReduceOp(getOperation(), getInput().getType(), getDimArg());
 }
 
+//===----------------------------------------------------------------------===//
+// ReduceOrOp
+//===----------------------------------------------------------------------===//
+
+// ReduceOrOp kernel builder: delegates to createReduceOp, passing "or".
+void mlir::tt::ttir::ReduceOrOp::buildGenericRegion(
+    ::mlir::OpBuilder &opBuilder, ::mlir::Block *block) {
+  // NOLINTNEXTLINE
+  createReduceOp(opBuilder, block, getLoc(), "or");
+}
+
+// ReduceOrOp verification: delegates to the shared verifyReduceOp helper.
+::mlir::LogicalResult mlir::tt::ttir::ReduceOrOp::verify() {
+  return verifyReduceOp(getOperation(), getInput().getType(), getDimArg());
+}
+
 //===----------------------------------------------------------------------===//
 // Reduce ArgMaxOp
 //===----------------------------------------------------------------------===//
diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/reduction/reduce_or_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/reduction/reduce_or_op.mlir
new file mode 100644
index 0000000000..c4b63543c3
--- /dev/null
+++ b/test/ttmlir/Conversion/StableHLOToTTIR/reduction/reduce_or_op.mlir
@@ -0,0 +1,39 @@
+// REQUIRES: stablehlo
+// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s
+module @jit_reduce_or attributes {} {
+  func.func public @test_reduce_or_4to3dim(%arg0: tensor<128x10x32x4xi1>, %cst_0: tensor<i1>) -> tensor<128x10x32xi1> {
+    // CHECK-LABEL: func.func public @test_reduce_or_4to3dim
+    // CHECK: tensor.empty
+    // CHECK: "ttir.reduce_or"
+    // CHECK-SAME: dim_arg = [3 : i32]
+    // CHECK-SAME: keep_dim = false
+    // CHECK-SAME: tensor<128x10x32x4xbf16>
+    // CHECK-SAME: -> tensor<128x10x32xbf16>
+    %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.or across dimensions = [3] : (tensor<128x10x32x4xi1>, tensor<i1>) -> tensor<128x10x32xi1>
+    return %0 : tensor<128x10x32xi1>
+  }
+
+  func.func public @test_reduce_or_3to2dim(%arg0: tensor<128x10x4xi1>, %cst_0: tensor<i1>) -> tensor<128x4xi1> {
+    // CHECK-LABEL: func.func public @test_reduce_or_3to2dim
+    // CHECK: tensor.empty
+    // CHECK: "ttir.reduce_or"
+    // CHECK-SAME: dim_arg = [1 : i32]
+    // CHECK-SAME: keep_dim = false
+    // CHECK-SAME: tensor<128x10x4xbf16>
+    // CHECK-SAME: -> tensor<128x4xbf16>
+    %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.or across dimensions = [1] : (tensor<128x10x4xi1>, tensor<i1>) -> tensor<128x4xi1>
+    return %0 : tensor<128x4xi1>
+  }
+
+  func.func public @test_reduce_or_2to1dim(%arg0: tensor<128x10xi1>, %cst_0: tensor<i1>) -> tensor<10xi1> {
+    // CHECK-LABEL: func.func public @test_reduce_or_2to1dim
+    // CHECK: tensor.empty
+    // CHECK: "ttir.reduce_or"
+    // CHECK-SAME: dim_arg = [0 : i32]
+    // CHECK-SAME: keep_dim = false
+    // CHECK-SAME: tensor<128x10xbf16>
+    // CHECK-SAME: -> tensor<10xbf16>
+    %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.or across dimensions = [0] : (tensor<128x10xi1>, tensor<i1>) -> tensor<10xi1>
+    return %0 : tensor<10xi1>
+  }
+}
diff --git a/test/ttmlir/Decomposition/TTIR/reduction/reduce_or.mlir b/test/ttmlir/Decomposition/TTIR/reduction/reduce_or.mlir
new file mode 100644
index 0000000000..1225427c19
--- /dev/null
+++ b/test/ttmlir/Decomposition/TTIR/reduction/reduce_or.mlir
@@ -0,0 +1,41 @@
+// RUN: ttmlir-opt --ttir-to-ttir-decomposition %s | FileCheck %s
+module attributes {} {
+  func.func public @test_reduce_or_4to3dim(%arg0: tensor<128x10x32x4xbf16>, %arg1: tensor<1xbf16>) -> tensor<128x10x32xbf16> {
+    // CHECK-LABEL: func.func public @test_reduce_or_4to3dim
+    // CHECK: %[[SUM:[0-9]+]] = "ttir.sum"
+    // CHECK-SAME: dim_arg = [3 : i32]
+    // CHECK-SAME: keep_dim = false
+    // CHECK-SAME: tensor<128x10x32x4xbf16>
+    // CHECK-SAME: -> tensor<128x10x32xbf16>
+    // CHECK: return %[[SUM]]
+    %0 = tensor.empty() : tensor<128x10x32xbf16>
+    %1 = "ttir.reduce_or"(%arg0, %0) <{dim_arg = [3 : i32], keep_dim = false}> : (tensor<128x10x32x4xbf16>, tensor<128x10x32xbf16>) -> tensor<128x10x32xbf16>
+    return %1 : tensor<128x10x32xbf16>
+  }
+
+  func.func public @test_reduce_or_3to2dim(%arg0: tensor<128x10x4xbf16>, %arg1: tensor<1xbf16>) -> tensor<128x4xbf16> {
+    // CHECK-LABEL: func.func public @test_reduce_or_3to2dim
+    // CHECK: %[[SUM:[0-9]+]] = "ttir.sum"
+    // CHECK-SAME: dim_arg = [1 : i32]
+    // CHECK-SAME: keep_dim = false
+    // CHECK-SAME: tensor<128x10x4xbf16>
+    // CHECK-SAME: -> tensor<128x4xbf16>
+    // CHECK: return %[[SUM]]
+    %0 = tensor.empty() : tensor<128x4xbf16>
+    %1 = "ttir.reduce_or"(%arg0, %0) <{dim_arg = [1 : i32], keep_dim = false}> : (tensor<128x10x4xbf16>, tensor<128x4xbf16>) -> tensor<128x4xbf16>
+    return %1 : tensor<128x4xbf16>
+  }
+
+  func.func public @test_reduce_or_2to1dim(%arg0: tensor<128x10xbf16>, %arg1: tensor<1xbf16>) -> tensor<10xbf16> {
+    // CHECK-LABEL: func.func public @test_reduce_or_2to1dim
+    // CHECK: %[[SUM:[0-9]+]] = "ttir.sum"
+    // CHECK-SAME: dim_arg = [0 : i32]
+    // CHECK-SAME: keep_dim = false
+    // CHECK-SAME: tensor<128x10xbf16>
+    // CHECK-SAME: -> tensor<10xbf16>
+    // CHECK: return %[[SUM]]
+    %0 = tensor.empty() : tensor<10xbf16>
+    %1 = "ttir.reduce_or"(%arg0, %0) <{dim_arg = [0 : i32], keep_dim = false}> : (tensor<128x10xbf16>, tensor<10xbf16>) -> tensor<10xbf16>
+    return %1 : tensor<10xbf16>
+  }
+}
diff --git a/test/ttmlir/Dialect/TTNN/reduction/simple_reduce_or.mlir b/test/ttmlir/Dialect/TTNN/reduction/simple_reduce_or.mlir
new file mode 100644
index 0000000000..1d269ac676
--- /dev/null
+++ b/test/ttmlir/Dialect/TTNN/reduction/simple_reduce_or.mlir
@@ -0,0 +1,39 @@
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s
+
+module attributes {} {
+  func.func public @test_reduce_or_4to3dim(%arg0: tensor<128x10x32x4xbf16>, %arg1: tensor<1xbf16>) -> tensor<128x10x32xbf16> {
+    // CHECK-LABEL: func.func public @test_reduce_or_4to3dim
+    // CHECK: %[[SUM:[0-9]+]] = "ttnn.sum"
+    // CHECK-SAME: dim_arg = [3 : i32]
+    // CHECK-SAME: keep_dim = false
+    // CHECK-SAME: tensor<128x10x32x4xbf16,
+    // CHECK-SAME: -> tensor<128x10x32xbf16,
+    %0 = tensor.empty() : tensor<128x10x32xbf16>
+    %1 = "ttir.reduce_or"(%arg0, %0) <{dim_arg = [3 : i32], keep_dim = false}> : (tensor<128x10x32x4xbf16>, tensor<128x10x32xbf16>) -> tensor<128x10x32xbf16>
+    return %1 : tensor<128x10x32xbf16>
+  }
+
+  func.func public @test_reduce_or_3to2dim(%arg0: tensor<128x10x4xbf16>, %arg1: tensor<1xbf16>) -> tensor<128x4xbf16> {
+    // CHECK-LABEL: func.func public @test_reduce_or_3to2dim
+    // CHECK: %[[SUM:[0-9]+]] = "ttnn.sum"
+    // CHECK-SAME: dim_arg = [1 : i32]
+    // CHECK-SAME: keep_dim = false
+    // CHECK-SAME: tensor<128x10x4xbf16,
+    // CHECK-SAME: -> tensor<128x4xbf16,
+    %0 = tensor.empty() : tensor<128x4xbf16>
+    %1 = "ttir.reduce_or"(%arg0, %0) <{dim_arg = [1 : i32], keep_dim = false}> : (tensor<128x10x4xbf16>, tensor<128x4xbf16>) -> tensor<128x4xbf16>
+    return %1 : tensor<128x4xbf16>
+  }
+
+  func.func public @test_reduce_or_2to1dim(%arg0: tensor<128x10xbf16>, %arg1: tensor<1xbf16>) -> tensor<10xbf16> {
+    // CHECK-LABEL: func.func public @test_reduce_or_2to1dim
+    // CHECK: %[[SUM:[0-9]+]] = "ttnn.sum"
+    // CHECK-SAME: dim_arg = [0 : i32]
+    // CHECK-SAME: keep_dim = false
+    // CHECK-SAME: tensor<128x10xbf16,
+    // CHECK-SAME: -> tensor<10xbf16,
+    %0 = tensor.empty() : tensor<10xbf16>
+    %1 = "ttir.reduce_or"(%arg0, %0) <{dim_arg = [0 : i32], keep_dim = false}> : (tensor<128x10xbf16>, tensor<10xbf16>) -> tensor<10xbf16>
+    return %1 : tensor<10xbf16>
+  }
+}
diff --git a/test/ttmlir/Silicon/StableHLO/n150/reduction/reduce_or_op.mlir b/test/ttmlir/Silicon/StableHLO/n150/reduction/reduce_or_op.mlir
new file mode 100644
index 0000000000..4584f323f2
--- /dev/null
+++ b/test/ttmlir/Silicon/StableHLO/n150/reduction/reduce_or_op.mlir
@@ -0,0 +1,19 @@
+// REQUIRES: stablehlo
+// RUN: rm -rf %t.ttnn
+// RUN: rm -rf %t.mlir
+// RUN: ttmlir-opt --stablehlo-to-ttir-pipeline \
+// RUN:     --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir
+// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
+// RUN: FileCheck --input-file=%t.mlir %s
+
+module @jit_reduce_or attributes {} {
+  func.func public @test_reduce_or_4to3dim(%arg0: tensor<128x10x32x4xi1>, %cst_0: tensor<i1>) -> tensor<128x10x32xi1> {
+    // CHECK-LABEL: func.func public @test_reduce_or_4to3dim
+    // CHECK: "ttnn.sum"
+    // CHECK-SAME: dim_arg = [3 : i32]
+    // CHECK-SAME: keep_dim = false
+    // CHECK-SAME: -> tensor<128x10x32xbf16,
+    %0 = stablehlo.reduce(%arg0 init: %cst_0) applies stablehlo.or across dimensions = [3] : (tensor<128x10x32x4xi1>, tensor<i1>) -> tensor<128x10x32xi1>
+    return %0 : tensor<128x10x32xi1>
+  }
+}
diff --git a/test/ttmlir/Silicon/TTNN/n150/simple_reduce_or.mlir b/test/ttmlir/Silicon/TTNN/n150/simple_reduce_or.mlir
new file mode 100644
index 0000000000..e754cfac84
--- /dev/null
+++ b/test/ttmlir/Silicon/TTNN/n150/simple_reduce_or.mlir
@@ -0,0 +1,43 @@
+// RUN: rm -rf %t.ttnn
+// RUN: rm -rf %t.mlir
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir
+// RUN: FileCheck %s --input-file=%t.mlir
+// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
+
+module attributes {} {
+  func.func public @test_reduce_or_4to2dim(%arg0: tensor<128x10x32x4xbf16>, %arg1: tensor<1xbf16>) -> tensor<128x32xbf16> {
+    // CHECK-LABEL: func.func public @test_reduce_or_4to2dim
+    // CHECK: %[[SUM:[0-9]+]] = "ttnn.sum"
+    // CHECK-SAME: dim_arg = [1 : i32, 3 : i32]
+    // CHECK-SAME: keep_dim = false
+    // CHECK-SAME: tensor<128x10x32x4xbf16,
+    // CHECK-SAME: -> tensor<128x32xbf16,
+    %0 = tensor.empty() : tensor<128x32xbf16>
+    %1 = "ttir.reduce_or"(%arg0, %0) <{dim_arg = [1: i32, 3 : i32], keep_dim = false}> : (tensor<128x10x32x4xbf16>, tensor<128x32xbf16>) -> tensor<128x32xbf16>
+    return %1 : tensor<128x32xbf16>
+  }
+
+  func.func public @test_reduce_or_3to2dim(%arg0: tensor<128x10x4xbf16>, %arg1: tensor<1xbf16>) -> tensor<128x4xbf16> {
+    // CHECK-LABEL: func.func public @test_reduce_or_3to2dim
+    // CHECK: %[[SUM:[0-9]+]] = "ttnn.sum"
+    // CHECK-SAME: dim_arg = [1 : i32]
+    // CHECK-SAME: keep_dim = false
+    // CHECK-SAME: tensor<128x10x4xbf16,
+    // CHECK-SAME: -> tensor<128x4xbf16,
+    %0 = tensor.empty() : tensor<128x4xbf16>
+    %1 = "ttir.reduce_or"(%arg0, %0) <{dim_arg = [1 : i32], keep_dim = false}> : (tensor<128x10x4xbf16>, tensor<128x4xbf16>) -> tensor<128x4xbf16>
+    return %1 : tensor<128x4xbf16>
+  }
+
+  func.func public @test_reduce_or_2to1dim(%arg0: tensor<128x10xbf16>, %arg1: tensor<1xbf16>) -> tensor<10xbf16> {
+    // CHECK-LABEL: func.func public @test_reduce_or_2to1dim
+    // CHECK: %[[SUM:[0-9]+]] = "ttnn.sum"
+    // CHECK-SAME: dim_arg = [0 : i32]
+    // CHECK-SAME: keep_dim = false
+    // CHECK-SAME: tensor<128x10xbf16,
+    // CHECK-SAME: -> tensor<10xbf16,
+    %0 = tensor.empty() : tensor<10xbf16>
+    %1 = "ttir.reduce_or"(%arg0, %0) <{dim_arg = [0 : i32], keep_dim = false}> : (tensor<128x10xbf16>, tensor<10xbf16>) -> tensor<10xbf16>
+    return %1 : tensor<10xbf16>
+  }
+}