NVIDIA · jacobhinkle · Feb 6, 2025 · Feb 6, 2025 · Feb 6, 2025 · Feb 7, 2025
diff --git a/csrc/index_compute.cpp b/csrc/index_compute.cpp
@@ -20,6 +20,7 @@
 #include <expr_simplifier.h>
 #include <instrumentation.h>
 #include <ir/all_nodes.h>
+#include <ir/builder.h>
 #include <ir/iostream.h>
 #include <ir/utils.h>
 #include <logical_domain_map.h>
@@ -2717,6 +2718,14 @@ std::pair<Val*, Val*> Index::getCpAsyncBulkGmemIndex(
     auto indices_inner_to_outer =
         indexer.getIndexFor(ldst, !is_load, ids_to_index, loops);
 
+    // These are the box coordinates of the TMA box, which must be of type
+    // int32_t. Possible overflow in each of these dims should be checked
+    // elsewhere.
+    for (size_t i : c10::irange(indices_inner_to_outer.size())) {
+      indices_inner_to_outer[i] =
+          IrBuilder::maybeCastExpr(DataType::Int32, indices_inner_to_outer[i]);
+    }
+
     auto coordinate = IrBuilder::arrayExpr(indices_inner_to_outer);
     auto descriptor = tma_info.tensorMap();
     if (is_load) {

diff --git a/csrc/kernel.h b/csrc/kernel.h
@@ -16,6 +16,7 @@
 #include <ir/base_nodes.h>
 #include <ir/builder.h>
 #include <parallel_dimension_map.h>
+#include <type.h>
 #include <utils.h>
 #include <vectorization_info.h>
 #include <visibility.h>
@@ -223,6 +224,10 @@ class NVF_API Kernel final : public Fusion {
     return index_type_;
   }
 
+  void setIndexType(PrimDataType new_index_type) {
+    index_type_ = new_index_type; // Unconditional overwrite, no validation
+  }
+
   //! Checks if parallel type is padded
   bool isParallelTypePadded(ParallelType ptype) const {
     return ptype == ParallelType::TIDx &&