-
Notifications
You must be signed in to change notification settings - Fork 12.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[NVPTX] Convert vector function nvvm.annotations to attributes #127736
base: main
Are you sure you want to change the base?
[NVPTX] Convert vector function nvvm.annotations to attributes #127736
Conversation
@llvm/pr-subscribers-clang @llvm/pr-subscribers-clang-codegen Author: Alex MacLean (AlexMaclean) Changes: Replace some more nvvm.annotations with function attributes, auto-upgrading the annotations as needed. These new attributes will be more idiomatic and compile-time efficient than the annotations.
Patch is 46.21 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/127736.diff — 21 Files Affected:
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index f89d32d4e13fe..75c3387c88c4b 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -353,17 +353,13 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
int32_t *MaxThreadsVal,
int32_t *MinBlocksVal,
int32_t *MaxClusterRankVal) {
- // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
llvm::APSInt MaxThreads(32);
MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(getContext());
if (MaxThreads > 0) {
if (MaxThreadsVal)
*MaxThreadsVal = MaxThreads.getExtValue();
- if (F) {
- // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
- NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
- MaxThreads.getExtValue());
- }
+ if (F)
+ F->addFnAttr("nvvm.maxntid", llvm::utostr(MaxThreads.getExtValue()));
}
// min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it
diff --git a/clang/test/CodeGenCUDA/launch-bounds.cu b/clang/test/CodeGenCUDA/launch-bounds.cu
index 72f7857264f8c..fba66e85040c7 100644
--- a/clang/test/CodeGenCUDA/launch-bounds.cu
+++ b/clang/test/CodeGenCUDA/launch-bounds.cu
@@ -10,23 +10,30 @@
#endif
// CHECK: @Kernel1() #[[ATTR0:[0-9]+]]
+// CHECK: @Kernel2() #[[ATTR1:[0-9]+]]
+// CHECK: @{{.*}}Kernel3{{.*}}() #[[ATTR1]]
// CHECK: @{{.*}}Kernel4{{.*}}() #[[ATTR0]]
-// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR1:[0-9]+]]
-// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR0]]
-// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR3:[0-9]+]]
-
-// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
-// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.minctasm"="258" {{.*}}}
-// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="12" {{.*}}}
-
-// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR4:[0-9]+]]
-// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR4]]
-// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR5:[0-9]+]]
-// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR6:[0-9]+]]
-
-// CHECK_MAX_BLOCKS: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.minctasm"="2" {{.*}}}
-// CHECK_MAX_BLOCKS: attributes #[[ATTR5]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.minctasm"="258" {{.*}}}
-// CHECK_MAX_BLOCKS: attributes #[[ATTR6]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.minctasm"="12" {{.*}}}
+// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR2:[0-9]+]]
+// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR3:[0-9]+]]
+// CHECK: @{{.*}}Kernel7{{.*}}() #[[ATTR1]]
+// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR4:[0-9]+]]
+
+// CHECK-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}
+
+// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR0:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR0]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR1:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel7_sm_90{{.*}} #[[ATTR2:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR3:[0-9]+]]
+
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}
// Test both max threads per block and Min cta per sm.
extern "C" {
@@ -37,8 +44,6 @@ Kernel1()
}
}
-// CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"maxntidx", i32 256}
-
#ifdef USE_MAX_BLOCKS
// Test max threads per block and min/max cta per sm.
extern "C" {
@@ -48,8 +53,6 @@ Kernel1_sm_90()
{
}
}
-
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxntidx", i32 256}
#endif // USE_MAX_BLOCKS
// Test only max threads per block. Min cta per sm defaults to 0, and
@@ -62,8 +65,6 @@ Kernel2()
}
}
-// CHECK: !{{[0-9]+}} = !{ptr @Kernel2, !"maxntidx", i32 256}
-
template <int max_threads_per_block>
__global__ void
__launch_bounds__(max_threads_per_block)
@@ -72,7 +73,6 @@ Kernel3()
}
template __global__ void Kernel3<MAX_THREADS_PER_BLOCK>();
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel3{{.*}}, !"maxntidx", i32 256}
template <int max_threads_per_block, int min_blocks_per_mp>
__global__ void
@@ -82,7 +82,6 @@ Kernel4()
}
template __global__ void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}
#ifdef USE_MAX_BLOCKS
template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
@@ -93,7 +92,6 @@ Kernel4_sm_90()
}
template __global__ void Kernel4_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxntidx", i32 256}
#endif //USE_MAX_BLOCKS
const int constint = 100;
@@ -106,8 +104,6 @@ Kernel5()
}
template __global__ void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356}
-
#ifdef USE_MAX_BLOCKS
template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
@@ -120,7 +116,6 @@ Kernel5_sm_90()
}
template __global__ void Kernel5_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxntidx", i32 356}
#endif //USE_MAX_BLOCKS
// Make sure we don't emit negative launch bounds values.
@@ -129,15 +124,12 @@ __launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
Kernel6()
{
}
-// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"maxntidx",
__global__ void
__launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP )
Kernel7()
{
}
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"maxntidx",
-// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"minctasm",
#ifdef USE_MAX_BLOCKS
__global__ void
@@ -145,17 +137,12 @@ __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP, -MAX_BLOCKS_PER_MP
Kernel7_sm_90()
{
}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxntidx",
-// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"minctasm",
-// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxclusterrank",
#endif // USE_MAX_BLOCKS
const char constchar = 12;
__global__ void __launch_bounds__(constint, constchar) Kernel8() {}
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100
#ifdef USE_MAX_BLOCKS
const char constchar_2 = 14;
__global__ void __launch_bounds__(constint, constchar, constchar_2) Kernel8_sm_90() {}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxntidx", i32 100
#endif // USE_MAX_BLOCKS
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 7eacc58549c7d..73560ae84f2f4 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -74,6 +74,29 @@ Function Attributes
This attribute indicates the maximum number of registers to be used for the
kernel function.
+``"nvvm.maxntid"="<x>"``
+``"nvvm.maxntid"="<x>,<y>"``
+``"nvvm.maxntid"="<x>,<y>,<z>"``
+ This attribute declares the maximum number of threads in the thread block
+ (CTA). The maximum number of threads is the product of the maximum extent in
+ each dimension. Exceeding the maximum number of threads results in a runtime
+ error or kernel launch failure.
+
+``"nvvm.reqntid"="<x>"``
+``"nvvm.reqntid"="<x>,<y>"``
+``"nvvm.reqntid"="<x>,<y>,<z>"``
+ This attribute declares the exact number of threads in the thread block
+ (CTA). The number of threads is the product of the value in each dimension.
+ Specifying a different CTA dimension at launch will result in a runtime
+ error or kernel launch failure.
+
+``"nvvm.cluster_dim"="<x>"``
+``"nvvm.cluster_dim"="<x>,<y>"``
+``"nvvm.cluster_dim"="<x>,<y>,<z>"``
+ This attribute declares the number of thread blocks (CTAs) in the cluster.
+ The total number of CTAs is the product of the number of CTAs in each
+ dimension. Specifying a different cluster dimension at launch will result in
+ a runtime error or kernel launch failure. Only supported for Hopper+.
.. _address_spaces:
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 04acab1e5765e..c2451747077e5 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6366,45 +6366,13 @@ void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
KernelEnvironmentGV->setInitializer(NewInitializer);
}
-static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
- Module &M = *Kernel.getParent();
- NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
- for (auto *Op : MD->operands()) {
- if (Op->getNumOperands() != 3)
- continue;
- auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
- if (!KernelOp || KernelOp->getValue() != &Kernel)
- continue;
- auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
- if (!Prop || Prop->getString() != Name)
- continue;
- return Op;
- }
- return nullptr;
-}
-
-static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
- bool Min) {
- // Update the "maxntidx" metadata for NVIDIA, or add it.
- MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
- if (ExistingOp) {
- auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
- int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
- ExistingOp->replaceOperandWith(
- 2, ConstantAsMetadata::get(ConstantInt::get(
- OldVal->getValue()->getType(),
- Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
- } else {
- LLVMContext &Ctx = Kernel.getContext();
- Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
- MDString::get(Ctx, Name),
- ConstantAsMetadata::get(
- ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
- // Append metadata to nvvm.annotations
- Module &M = *Kernel.getParent();
- NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
- MD->addOperand(MDNode::get(Ctx, MDVals));
+static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
+ bool Min) {
+ if (Kernel.hasFnAttribute(Name)) {
+ int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
+ Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
}
+ Kernel.addFnAttr(Name, llvm::utostr(Value));
}
std::pair<int32_t, int32_t>
@@ -6426,9 +6394,8 @@ OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
return {LB, UB};
}
- if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
- auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
- int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
+ if (Kernel.hasFnAttribute("nvvm.maxntid")) {
+ int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
}
return {0, ThreadLimit};
@@ -6445,7 +6412,7 @@ void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
return;
}
- updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
+ updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
}
std::pair<int32_t, int32_t>
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 57072715366c9..dc18ba9780fec 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -13,11 +13,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/AutoUpgrade.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/AttributeMask.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
@@ -46,6 +48,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Regex.h"
#include "llvm/TargetParser/Triple.h"
+#include <cstdint>
#include <cstring>
#include <numeric>
@@ -5021,6 +5024,36 @@ bool llvm::UpgradeDebugInfo(Module &M) {
return Modified;
}
+static void upgradeNVVMFnVectorAttr(const StringRef Attr, const char DimC,
+ GlobalValue *GV, const Metadata *V) {
+ Function *F = cast<Function>(GV);
+
+ constexpr StringLiteral DefaultValue = "1";
+ StringRef Vect3[3] = {DefaultValue, DefaultValue, DefaultValue};
+ unsigned Length = 0;
+
+ if (F->hasFnAttribute(Attr)) {
+ StringRef S = F->getFnAttribute(Attr).getValueAsString();
+ for (; Length < 3 && !S.empty(); Length++) {
+ auto [Part, Rest] = S.split(',');
+ Vect3[Length] = Part.trim();
+ S = Rest;
+ }
+ }
+
+ const uint64_t VInt = mdconst::extract<ConstantInt>(V)->getZExtValue();
+ const std::string VStr = llvm::utostr(VInt);
+
+ const unsigned Dim = DimC - 'x';
+ assert(Dim >= 0 && Dim < 3 && "Unexpected dim char");
+
+ Vect3[Dim] = VStr;
+ Length = std::max(Length, Dim + 1);
+
+ const std::string NewAttr = llvm::join(ArrayRef(Vect3, Length), ",");
+ F->addFnAttr(Attr, NewAttr);
+}
+
bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
const Metadata *V) {
if (K == "kernel") {
@@ -5059,6 +5092,18 @@ bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
cast<Function>(GV)->addFnAttr("nvvm.maxnreg", llvm::utostr(CV));
return true;
}
+ if (K.consume_front("maxntid") && (K == "x" || K == "y" || K == "z")) {
+ upgradeNVVMFnVectorAttr("nvvm.maxntid", K[0], GV, V);
+ return true;
+ }
+ if (K.consume_front("reqntid") && (K == "x" || K == "y" || K == "z")) {
+ upgradeNVVMFnVectorAttr("nvvm.reqntid", K[0], GV, V);
+ return true;
+ }
+ if (K.consume_front("cluster_dim_") && (K == "x" || K == "y" || K == "z")) {
+ upgradeNVVMFnVectorAttr("nvvm.cluster_dim", K[0], GV, V);
+ return true;
+ }
return false;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 75d930d9f7b6f..0c172bd2d6a7e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -27,6 +27,7 @@
#include "cl_common_defines.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallString.h"
@@ -34,6 +35,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -505,24 +507,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
// If the NVVM IR has some of reqntid* specified, then output
// the reqntid directive, and set the unspecified ones to 1.
// If none of Reqntid* is specified, don't output reqntid directive.
- std::optional<unsigned> Reqntidx = getReqNTIDx(F);
- std::optional<unsigned> Reqntidy = getReqNTIDy(F);
- std::optional<unsigned> Reqntidz = getReqNTIDz(F);
+ const auto ReqNTID = getReqNTID(F);
+ if (!ReqNTID.empty())
+ O << formatv(".reqntid {0:$[, ]}\n",
+ make_range(ReqNTID.begin(), ReqNTID.end()));
- if (Reqntidx || Reqntidy || Reqntidz)
- O << ".reqntid " << Reqntidx.value_or(1) << ", " << Reqntidy.value_or(1)
- << ", " << Reqntidz.value_or(1) << "\n";
-
- // If the NVVM IR has some of maxntid* specified, then output
- // the maxntid directive, and set the unspecified ones to 1.
- // If none of maxntid* is specified, don't output maxntid directive.
- std::optional<unsigned> Maxntidx = getMaxNTIDx(F);
- std::optional<unsigned> Maxntidy = getMaxNTIDy(F);
- std::optional<unsigned> Maxntidz = getMaxNTIDz(F);
-
- if (Maxntidx || Maxntidy || Maxntidz)
- O << ".maxntid " << Maxntidx.value_or(1) << ", " << Maxntidy.value_or(1)
- << ", " << Maxntidz.value_or(1) << "\n";
+ const auto MaxNTID = getMaxNTID(F);
+ if (!MaxNTID.empty())
+ O << formatv(".maxntid {0:$[, ]}\n",
+ make_range(MaxNTID.begin(), MaxNTID.end()));
if (const auto Mincta = getMinCTASm(F))
O << ".minnctapersm " << *Mincta << "\n";
@@ -536,21 +529,19 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
if (STI->getSmVersion() >= 90) {
- std::optional<unsigned> ClusterX = getClusterDimx(F);
- std::optional<unsigned> ClusterY = getClusterDimy(F);
- std::optional<unsigned> ClusterZ = getClusterDimz(F);
+ const auto ClusterDim = getClusterDim(F);
- if (ClusterX || ClusterY || ClusterZ) {
+ if (!ClusterDim.empty()) {
O << ".explicitcluster\n";
- if (ClusterX.value_or(1) != 0) {
- assert(ClusterY.value_or(1) && ClusterZ.value_or(1) &&
+ if (ClusterDim[0] != 0) {
+ assert(llvm::all_of(ClusterDim, [](unsigned D) { return D != 0; }) &&
"cluster_dim_x != 0 implies cluster_dim_y and cluster_dim_z "
"should be non-zero as well");
- O << ".reqnctapercluster " << ClusterX.value_or(1) << ", "
- << ClusterY.value_or(1) << ", " << ClusterZ.value_or(1) << "\n";
+ O << formatv(".reqnctapercluster {0:$[, ]}\n",
+ make_range(ClusterDim.begin(), ClusterDim.end()));
} else {
- assert(!ClusterY.value_or(1) && !ClusterZ.value_or(1) &&
+ assert(llvm::all_of(ClusterDim, [](unsigned D) { return D == 0; }) &&
"cluster_dim_x == 0 implies cluster_dim_y and cluster_dim_z "
"should be 0 as well");
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
index ae5922cba4ce3..b10e0b14118a1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
@@ -50,33 +50,10 @@ static std::string getHash(StringRef Str) {
return llvm::utohexstr(Hash.low(), /*LowerCase=*/true);
}
-static void addKernelMetadata(Module &M, Function *F) {
- llvm::LLVMContext &Ctx = M.getContext();
-
- // Get "nvvm.annotations" metadata node.
- llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
-
- // This kernel is only to be called single-threaded.
- llvm::Metadata *ThreadXMDVals[] = {
- llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidx"),
- llvm::ConstantAsMetadata::get(
- llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
- llvm::Metadata *ThreadYMDVals[] = {
- llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidy"),
- llvm::ConstantAsMetadata::get(
- llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
- llvm::Metadata *ThreadZMDVals[] = {
- llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidz"),
- llvm::ConstantAsMetadata::get(
- llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
-
+static void addKernelAttrs(Function *F) {
F->addFnAttr("nvvm.maxclusterrank", "1");
+ F->addFnAttr("nvvm.maxntid", "1");
F->setCallingConv(CallingConv::PTX_Kernel);
-
- // Append metadata to nvvm.annotations.
- MD->addOperand(llvm::MDNode::get(Ctx, ThreadXMDVals));
- MD->addOper...
[truncated]
|
@llvm/pr-subscribers-llvm-analysis Author: Alex MacLean (AlexMaclean) Changes: Replace some more nvvm.annotations with function attributes, auto-upgrading the annotations as needed. These new attributes will be more idiomatic and compile-time efficient than the annotations.
Patch is 46.21 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/127736.diff — 21 Files Affected:
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index f89d32d4e13fe..75c3387c88c4b 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -353,17 +353,13 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
int32_t *MaxThreadsVal,
int32_t *MinBlocksVal,
int32_t *MaxClusterRankVal) {
- // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
llvm::APSInt MaxThreads(32);
MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(getContext());
if (MaxThreads > 0) {
if (MaxThreadsVal)
*MaxThreadsVal = MaxThreads.getExtValue();
- if (F) {
- // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
- NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
- MaxThreads.getExtValue());
- }
+ if (F)
+ F->addFnAttr("nvvm.maxntid", llvm::utostr(MaxThreads.getExtValue()));
}
// min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it
diff --git a/clang/test/CodeGenCUDA/launch-bounds.cu b/clang/test/CodeGenCUDA/launch-bounds.cu
index 72f7857264f8c..fba66e85040c7 100644
--- a/clang/test/CodeGenCUDA/launch-bounds.cu
+++ b/clang/test/CodeGenCUDA/launch-bounds.cu
@@ -10,23 +10,30 @@
#endif
// CHECK: @Kernel1() #[[ATTR0:[0-9]+]]
+// CHECK: @Kernel2() #[[ATTR1:[0-9]+]]
+// CHECK: @{{.*}}Kernel3{{.*}}() #[[ATTR1]]
// CHECK: @{{.*}}Kernel4{{.*}}() #[[ATTR0]]
-// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR1:[0-9]+]]
-// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR0]]
-// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR3:[0-9]+]]
-
-// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
-// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.minctasm"="258" {{.*}}}
-// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="12" {{.*}}}
-
-// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR4:[0-9]+]]
-// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR4]]
-// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR5:[0-9]+]]
-// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR6:[0-9]+]]
-
-// CHECK_MAX_BLOCKS: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.minctasm"="2" {{.*}}}
-// CHECK_MAX_BLOCKS: attributes #[[ATTR5]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.minctasm"="258" {{.*}}}
-// CHECK_MAX_BLOCKS: attributes #[[ATTR6]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.minctasm"="12" {{.*}}}
+// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR2:[0-9]+]]
+// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR3:[0-9]+]]
+// CHECK: @{{.*}}Kernel7{{.*}}() #[[ATTR1]]
+// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR4:[0-9]+]]
+
+// CHECK-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}
+
+// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR0:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR0]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR1:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel7_sm_90{{.*}} #[[ATTR2:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR3:[0-9]+]]
+
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}
// Test both max threads per block and Min cta per sm.
extern "C" {
@@ -37,8 +44,6 @@ Kernel1()
}
}
-// CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"maxntidx", i32 256}
-
#ifdef USE_MAX_BLOCKS
// Test max threads per block and min/max cta per sm.
extern "C" {
@@ -48,8 +53,6 @@ Kernel1_sm_90()
{
}
}
-
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxntidx", i32 256}
#endif // USE_MAX_BLOCKS
// Test only max threads per block. Min cta per sm defaults to 0, and
@@ -62,8 +65,6 @@ Kernel2()
}
}
-// CHECK: !{{[0-9]+}} = !{ptr @Kernel2, !"maxntidx", i32 256}
-
template <int max_threads_per_block>
__global__ void
__launch_bounds__(max_threads_per_block)
@@ -72,7 +73,6 @@ Kernel3()
}
template __global__ void Kernel3<MAX_THREADS_PER_BLOCK>();
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel3{{.*}}, !"maxntidx", i32 256}
template <int max_threads_per_block, int min_blocks_per_mp>
__global__ void
@@ -82,7 +82,6 @@ Kernel4()
}
template __global__ void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}
#ifdef USE_MAX_BLOCKS
template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
@@ -93,7 +92,6 @@ Kernel4_sm_90()
}
template __global__ void Kernel4_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxntidx", i32 256}
#endif //USE_MAX_BLOCKS
const int constint = 100;
@@ -106,8 +104,6 @@ Kernel5()
}
template __global__ void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356}
-
#ifdef USE_MAX_BLOCKS
template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
@@ -120,7 +116,6 @@ Kernel5_sm_90()
}
template __global__ void Kernel5_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxntidx", i32 356}
#endif //USE_MAX_BLOCKS
// Make sure we don't emit negative launch bounds values.
@@ -129,15 +124,12 @@ __launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
Kernel6()
{
}
-// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"maxntidx",
__global__ void
__launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP )
Kernel7()
{
}
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"maxntidx",
-// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"minctasm",
#ifdef USE_MAX_BLOCKS
__global__ void
@@ -145,17 +137,12 @@ __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP, -MAX_BLOCKS_PER_MP
Kernel7_sm_90()
{
}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxntidx",
-// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"minctasm",
-// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxclusterrank",
#endif // USE_MAX_BLOCKS
const char constchar = 12;
__global__ void __launch_bounds__(constint, constchar) Kernel8() {}
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100
#ifdef USE_MAX_BLOCKS
const char constchar_2 = 14;
__global__ void __launch_bounds__(constint, constchar, constchar_2) Kernel8_sm_90() {}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxntidx", i32 100
#endif // USE_MAX_BLOCKS
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 7eacc58549c7d..73560ae84f2f4 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -74,6 +74,29 @@ Function Attributes
This attribute indicates the maximum number of registers to be used for the
kernel function.
+``"nvvm.maxntid"="<x>"``
+``"nvvm.maxntid"="<x>,<y>"``
+``"nvvm.maxntid"="<x>,<y>,<z>"``
+ This attribute declares the maximum number of threads in the thread block
+ (CTA). The maximum number of threads is the product of the maximum extent in
+ each dimension. Exceeding the maximum number of threads results in a runtime
+ error or kernel launch failure.
+
+``"nvvm.reqntid"="<x>"``
+``"nvvm.reqntid"="<x>,<y>"``
+``"nvvm.reqntid"="<x>,<y>,<z>"``
+ This attribute declares the exact number of threads in the thread block
+ (CTA). The number of threads is the product of the value in each dimension.
+ Specifying a different CTA dimension at launch will result in a runtime
+ error or kernel launch failure.
+
+``"nvvm.cluster_dim"="<x>"``
+``"nvvm.cluster_dim"="<x>,<y>"``
+``"nvvm.cluster_dim"="<x>,<y>,<z>"``
+ This attribute declares the number of thread blocks (CTAs) in the cluster.
+ The total number of CTAs is the product of the number of CTAs in each
+ dimension. Specifying a different cluster dimension at launch will result in
+ a runtime error or kernel launch failure. Only supported for Hopper+.
.. _address_spaces:
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 04acab1e5765e..c2451747077e5 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6366,45 +6366,13 @@ void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
KernelEnvironmentGV->setInitializer(NewInitializer);
}
-static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
- Module &M = *Kernel.getParent();
- NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
- for (auto *Op : MD->operands()) {
- if (Op->getNumOperands() != 3)
- continue;
- auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
- if (!KernelOp || KernelOp->getValue() != &Kernel)
- continue;
- auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
- if (!Prop || Prop->getString() != Name)
- continue;
- return Op;
- }
- return nullptr;
-}
-
-static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
- bool Min) {
- // Update the "maxntidx" metadata for NVIDIA, or add it.
- MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
- if (ExistingOp) {
- auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
- int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
- ExistingOp->replaceOperandWith(
- 2, ConstantAsMetadata::get(ConstantInt::get(
- OldVal->getValue()->getType(),
- Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
- } else {
- LLVMContext &Ctx = Kernel.getContext();
- Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
- MDString::get(Ctx, Name),
- ConstantAsMetadata::get(
- ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
- // Append metadata to nvvm.annotations
- Module &M = *Kernel.getParent();
- NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
- MD->addOperand(MDNode::get(Ctx, MDVals));
+static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
+ bool Min) {
+ if (Kernel.hasFnAttribute(Name)) {
+ int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
+ Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
}
+ Kernel.addFnAttr(Name, llvm::utostr(Value));
}
std::pair<int32_t, int32_t>
@@ -6426,9 +6394,8 @@ OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
return {LB, UB};
}
- if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
- auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
- int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
+ if (Kernel.hasFnAttribute("nvvm.maxntid")) {
+ int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
}
return {0, ThreadLimit};
@@ -6445,7 +6412,7 @@ void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
return;
}
- updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
+ updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
}
std::pair<int32_t, int32_t>
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 57072715366c9..dc18ba9780fec 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -13,11 +13,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/AutoUpgrade.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/AttributeMask.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
@@ -46,6 +48,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Regex.h"
#include "llvm/TargetParser/Triple.h"
+#include <cstdint>
#include <cstring>
#include <numeric>
@@ -5021,6 +5024,36 @@ bool llvm::UpgradeDebugInfo(Module &M) {
return Modified;
}
+static void upgradeNVVMFnVectorAttr(const StringRef Attr, const char DimC,
+ GlobalValue *GV, const Metadata *V) {
+ Function *F = cast<Function>(GV);
+
+ constexpr StringLiteral DefaultValue = "1";
+ StringRef Vect3[3] = {DefaultValue, DefaultValue, DefaultValue};
+ unsigned Length = 0;
+
+ if (F->hasFnAttribute(Attr)) {
+ StringRef S = F->getFnAttribute(Attr).getValueAsString();
+ for (; Length < 3 && !S.empty(); Length++) {
+ auto [Part, Rest] = S.split(',');
+ Vect3[Length] = Part.trim();
+ S = Rest;
+ }
+ }
+
+ const uint64_t VInt = mdconst::extract<ConstantInt>(V)->getZExtValue();
+ const std::string VStr = llvm::utostr(VInt);
+
+ const unsigned Dim = DimC - 'x';
+ assert(Dim >= 0 && Dim < 3 && "Unexpected dim char");
+
+ Vect3[Dim] = VStr;
+ Length = std::max(Length, Dim + 1);
+
+ const std::string NewAttr = llvm::join(ArrayRef(Vect3, Length), ",");
+ F->addFnAttr(Attr, NewAttr);
+}
+
bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
const Metadata *V) {
if (K == "kernel") {
@@ -5059,6 +5092,18 @@ bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
cast<Function>(GV)->addFnAttr("nvvm.maxnreg", llvm::utostr(CV));
return true;
}
+ if (K.consume_front("maxntid") && (K == "x" || K == "y" || K == "z")) {
+ upgradeNVVMFnVectorAttr("nvvm.maxntid", K[0], GV, V);
+ return true;
+ }
+ if (K.consume_front("reqntid") && (K == "x" || K == "y" || K == "z")) {
+ upgradeNVVMFnVectorAttr("nvvm.reqntid", K[0], GV, V);
+ return true;
+ }
+ if (K.consume_front("cluster_dim_") && (K == "x" || K == "y" || K == "z")) {
+ upgradeNVVMFnVectorAttr("nvvm.cluster_dim", K[0], GV, V);
+ return true;
+ }
return false;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 75d930d9f7b6f..0c172bd2d6a7e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -27,6 +27,7 @@
#include "cl_common_defines.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallString.h"
@@ -34,6 +35,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -505,24 +507,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
// If the NVVM IR has some of reqntid* specified, then output
// the reqntid directive, and set the unspecified ones to 1.
// If none of Reqntid* is specified, don't output reqntid directive.
- std::optional<unsigned> Reqntidx = getReqNTIDx(F);
- std::optional<unsigned> Reqntidy = getReqNTIDy(F);
- std::optional<unsigned> Reqntidz = getReqNTIDz(F);
+ const auto ReqNTID = getReqNTID(F);
+ if (!ReqNTID.empty())
+ O << formatv(".reqntid {0:$[, ]}\n",
+ make_range(ReqNTID.begin(), ReqNTID.end()));
- if (Reqntidx || Reqntidy || Reqntidz)
- O << ".reqntid " << Reqntidx.value_or(1) << ", " << Reqntidy.value_or(1)
- << ", " << Reqntidz.value_or(1) << "\n";
-
- // If the NVVM IR has some of maxntid* specified, then output
- // the maxntid directive, and set the unspecified ones to 1.
- // If none of maxntid* is specified, don't output maxntid directive.
- std::optional<unsigned> Maxntidx = getMaxNTIDx(F);
- std::optional<unsigned> Maxntidy = getMaxNTIDy(F);
- std::optional<unsigned> Maxntidz = getMaxNTIDz(F);
-
- if (Maxntidx || Maxntidy || Maxntidz)
- O << ".maxntid " << Maxntidx.value_or(1) << ", " << Maxntidy.value_or(1)
- << ", " << Maxntidz.value_or(1) << "\n";
+ const auto MaxNTID = getMaxNTID(F);
+ if (!MaxNTID.empty())
+ O << formatv(".maxntid {0:$[, ]}\n",
+ make_range(MaxNTID.begin(), MaxNTID.end()));
if (const auto Mincta = getMinCTASm(F))
O << ".minnctapersm " << *Mincta << "\n";
@@ -536,21 +529,19 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
if (STI->getSmVersion() >= 90) {
- std::optional<unsigned> ClusterX = getClusterDimx(F);
- std::optional<unsigned> ClusterY = getClusterDimy(F);
- std::optional<unsigned> ClusterZ = getClusterDimz(F);
+ const auto ClusterDim = getClusterDim(F);
- if (ClusterX || ClusterY || ClusterZ) {
+ if (!ClusterDim.empty()) {
O << ".explicitcluster\n";
- if (ClusterX.value_or(1) != 0) {
- assert(ClusterY.value_or(1) && ClusterZ.value_or(1) &&
+ if (ClusterDim[0] != 0) {
+ assert(llvm::all_of(ClusterDim, [](unsigned D) { return D != 0; }) &&
"cluster_dim_x != 0 implies cluster_dim_y and cluster_dim_z "
"should be non-zero as well");
- O << ".reqnctapercluster " << ClusterX.value_or(1) << ", "
- << ClusterY.value_or(1) << ", " << ClusterZ.value_or(1) << "\n";
+ O << formatv(".reqnctapercluster {0:$[, ]}\n",
+ make_range(ClusterDim.begin(), ClusterDim.end()));
} else {
- assert(!ClusterY.value_or(1) && !ClusterZ.value_or(1) &&
+ assert(llvm::all_of(ClusterDim, [](unsigned D) { return D == 0; }) &&
"cluster_dim_x == 0 implies cluster_dim_y and cluster_dim_z "
"should be 0 as well");
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
index ae5922cba4ce3..b10e0b14118a1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
@@ -50,33 +50,10 @@ static std::string getHash(StringRef Str) {
return llvm::utohexstr(Hash.low(), /*LowerCase=*/true);
}
-static void addKernelMetadata(Module &M, Function *F) {
- llvm::LLVMContext &Ctx = M.getContext();
-
- // Get "nvvm.annotations" metadata node.
- llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
-
- // This kernel is only to be called single-threaded.
- llvm::Metadata *ThreadXMDVals[] = {
- llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidx"),
- llvm::ConstantAsMetadata::get(
- llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
- llvm::Metadata *ThreadYMDVals[] = {
- llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidy"),
- llvm::ConstantAsMetadata::get(
- llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
- llvm::Metadata *ThreadZMDVals[] = {
- llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidz"),
- llvm::ConstantAsMetadata::get(
- llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
-
+static void addKernelAttrs(Function *F) {
F->addFnAttr("nvvm.maxclusterrank", "1");
+ F->addFnAttr("nvvm.maxntid", "1");
F->setCallingConv(CallingConv::PTX_Kernel);
-
- // Append metadata to nvvm.annotations.
- MD->addOperand(llvm::MDNode::get(Ctx, ThreadXMDVals));
- MD->addOper...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
fe470ba
to
bae09a6
Compare
bae09a6
to
fd8f342
Compare
llvm/lib/IR/AutoUpgrade.cpp
Outdated
const uint64_t VInt = mdconst::extract<ConstantInt>(V)->getZExtValue(); | ||
const std::string VStr = llvm::utostr(VInt); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd move these down to where we set Vect3[Dim] = VStr;
. Maybe drop VStr
and assign directly.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We cannot drop VStr and assign directly because Vect3 is an array of StringRef; VStr must exist on the stack to provide backing storage for the string data.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Github unhelpfully does not show this comment in the diff, so I almost missed it. :-/
Then I'd make Vect3 a SmallVector of SmallString so we can't accidentally shoot ourselves in the foot.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The current implementation avoids the need to make an additional copy of the strings from the original attribute, since with StringRefs we can just point to the original data. If we move to SmallString this won't be the case.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fair enough. Just add a comment that we do need a local variable for the reference to point somewhere.
Most likely the compiler would do a good enough job of eliminating the unnecessary work, and this code is nowhere near a hot path in any case.
const auto ReqNTID = getReqNTID(F); | ||
if (!ReqNTID.empty()) | ||
O << formatv(".reqntid {0:$[, ]}\n", | ||
make_range(ReqNTID.begin(), ReqNTID.end())); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
getReqNTID()
already returns a small vector which should be accepted as a range input. Do we really need an explicit range creation here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree it seems like it should not be necessary, but this was the only way I could get it to compile. If you know a cleaner/more idiomatic way to do this that would be great.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like make_range()
is all we have now. That's what formatv() tests use, too:
llvm-project/llvm/unittests/Support/FormatVariadicTest.cpp
Lines 607 to 612 in f62f13d
TEST(FormatVariadicTest, Range) { | |
std::vector<int> IntRange = {1, 1, 2, 3, 5, 8, 13}; | |
// 1. Simple range with default separator and element style. | |
EXPECT_EQ("1, 1, 2, 3, 5, 8, 13", | |
formatv("{0}", make_range(IntRange.begin(), IntRange.end())).str()); |
const std::string VStr = llvm::utostr(VInt); | ||
Vect3[Dim] = VStr; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just Vect3[Dim] = llvm::utostr(VInt);
const auto ReqNTID = getReqNTID(F); | ||
if (!ReqNTID.empty()) | ||
O << formatv(".reqntid {0:$[, ]}\n", | ||
make_range(ReqNTID.begin(), ReqNTID.end())); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like make_range()
is all we have now. That's what formatv() tests use, too:
llvm-project/llvm/unittests/Support/FormatVariadicTest.cpp
Lines 607 to 612 in f62f13d
TEST(FormatVariadicTest, Range) { | |
std::vector<int> IntRange = {1, 1, 2, 3, 5, 8, 13}; | |
// 1. Simple range with default separator and element style. | |
EXPECT_EQ("1, 1, 2, 3, 5, 8, 13", | |
formatv("{0}", make_range(IntRange.begin(), IntRange.end())).str()); |
Replace some more nvvm.annotations with function attributes, auto-upgrading the annotations as needed. These new attributes will be more idiomatic and compile-time efficient than the annotations.