Skip to content

Commit

Permalink
Relax FP8 gated activations requirements
Browse files Browse the repository at this point in the history
Expanded MXFP8 and FP8 tests coverage

Signed-off-by: Przemek Tredak <[email protected]>
  • Loading branch information
ptrendx committed Feb 1, 2025
1 parent f5f2872 commit 0e1ea26
Show file tree
Hide file tree
Showing 12 changed files with 440 additions and 324 deletions.
8 changes: 6 additions & 2 deletions tests/cpp/operator/test_act.cu
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,12 @@ void performTestGLU(const size_t N, const size_t H) {
ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);

if (otype == DType::kFloat8E4M3 || otype == DType::kFloat8E5M2) {
auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax);
auto [atol, rtol] = getTolerances(DType::kFloat32);
compareResults("amax", output.amax(), ref_amax, atol, rtol);
if (output.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) {
const float ref_scale = 1.f; // assuming input scale is 1.f
compareResults("scale_inv", *output.rowwise_cpu_scale_inv_ptr<float>(), ref_scale, atol, rtol);
}
}
auto [atol, rtol] = getTolerances(otype);
compareResults("output_gelu", output, ref_output.get(), atol, rtol);
Expand Down
40 changes: 22 additions & 18 deletions tests/cpp/operator/test_cast.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,31 +23,31 @@ namespace {

template <typename InputType, typename OutputType>
// Reference cast: writes scale * data[i] cast to OutputType into output_c,
// and records the maximum absolute input value (amax) seen over the whole
// flattened tensor of `size` elements.
void compute_ref(const InputType *data, OutputType *output_c,
                 const size_t size,
                 float *amax, float scale) {
  using compute_t = float;
  // Negative sentinel below any |value|; amax is a max of fabsf() >= 0, so
  // any finite negative start works. (-1e100 would be an out-of-range
  // double->float conversion, which is undefined behavior.)
  compute_t current_max = -1e30f;
  for (size_t i = 0; i < size; ++i) {
    const compute_t current = static_cast<compute_t>(data[i]);
    current_max = fmaxf(current_max, fabsf(current));
    output_c[i] = OutputType(scale * current);
  }
  // NOTE(review): for size == 0 this leaves *amax at the negative sentinel —
  // callers are assumed to never pass an empty tensor.
  *amax = current_max;
}

template <typename InputType, typename OutputType>
void performTest(const size_t N, const size_t H) {
void performTest(const std::vector<size_t>& shape) {
using namespace test;

const size_t full_size = product(shape);

DType itype = TypeInfo<InputType>::dtype;
DType otype = TypeInfo<OutputType>::dtype;

Tensor input({ N, H }, itype);
Tensor output_c({ N, H }, otype);
Tensor input(shape, itype);
Tensor output_c(shape, otype);

std::unique_ptr<OutputType[]> ref_output_c = std::make_unique<OutputType[]>(N * H);
std::unique_ptr<OutputType[]> ref_output_c = std::make_unique<OutputType[]>(full_size);

fillUniform(&input);
setRandomScale(&output_c);
Expand All @@ -56,7 +56,7 @@ void performTest(const size_t N, const size_t H) {

float ref_amax;
compute_ref<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(), ref_output_c.get(),
N, H, &ref_amax, output_c.scale());
full_size, &ref_amax, output_c.scale());

cudaDeviceSynchronize();
auto err = cudaGetLastError();
Expand All @@ -71,27 +71,29 @@ void performTest(const size_t N, const size_t H) {
compareResults("output_c", output_c, ref_output_c.get(), true, atol, rtol);
}

// Tensor shapes under test: 1-D, typical 2-D matrices, degenerate rows,
// non-multiple-of-16 trailing dimensions, and one N-D case.
std::vector<std::vector<size_t>> test_cases = {
  {16},
  {16000},
  {128, 128},
  {256, 256},
  {768, 1024},
  {256, 65536},
  {2048, 12288},
  {65536, 128},
  {65536, 160},
  {16384, 1616},
  {1, 128},
  {1, 1296},
  {1, 16},
  {5, 160},
  {5, 4, 3, 160},
  {217, 256},
};
} // namespace

class CastTestSuite : public ::testing::TestWithParam<std::tuple<transformer_engine::DType,
transformer_engine::DType,
std::pair<size_t, size_t>>> {};
std::vector<size_t>>> {};

TEST_P(CastTestSuite, TestCast) {
using namespace transformer_engine;
Expand All @@ -103,7 +105,7 @@ TEST_P(CastTestSuite, TestCast) {

TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(input_type, InputType,
TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(output_type, OutputType,
performTest<InputType, OutputType>(size.first, size.second);
performTest<InputType, OutputType>(size);
);
);
}
Expand All @@ -119,8 +121,10 @@ INSTANTIATE_TEST_SUITE_P(
::testing::ValuesIn(test_cases)),
[](const testing::TestParamInfo<CastTestSuite::ParamType>& info) {
std::string name = test::typeName(std::get<0>(info.param)) + "X" +
test::typeName(std::get<1>(info.param)) + "X" +
std::to_string(std::get<2>(info.param).first) + "X" +
std::to_string(std::get<2>(info.param).second);
test::typeName(std::get<1>(info.param));
const auto& shape = std::get<2>(info.param);
for ( const auto& s: shape) {
name += "X" + std::to_string(s);
}
return name;
});
25 changes: 15 additions & 10 deletions tests/cpp/operator/test_cast_dbias.cu
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,19 @@ void compute_ref_cast_dbias(const IT *input_h,
}

template <typename IType, typename OType>
void performTest(const size_t N, const size_t H) {
void performTest(const std::vector<size_t>& shape) {
using namespace test;
using CType = fp32;

DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;

Tensor input({N, H}, itype);
const size_t N = first_dimension(shape);
const size_t H = last_dimension(shape);

Tensor output_c({N, H}, otype);
Tensor input(shape, itype);

Tensor output_c(shape, otype);
// dbias has the same data type with "output grad"
Tensor dbias({H}, itype);

Expand Down Expand Up @@ -117,20 +120,20 @@ void performTest(const size_t N, const size_t H) {
compareResults("output_dbias", dbias, ref_output_dbias.get(), true, atol_dbias, rtol_dbias);
}

// Tensor shapes under test: typical 2-D matrices, degenerate rows,
// non-multiple-of-16 trailing dimensions, and one N-D case (dbias reduces
// over all leading dimensions).
std::vector<std::vector<size_t>> test_cases = {
  {128, 128},
  {256, 256},
  {768, 1024},
  {256, 65536},
  {2048, 12288},
  {65536, 128},
  {65536, 160},
  {16384, 1616},
  {1, 128},
  {1, 1296},
  {1, 16},
  {5, 160},
  {5, 4, 3, 160},
  {217, 256},
};

Expand All @@ -139,7 +142,7 @@ std::vector<std::pair<size_t, size_t>> test_cases = {

class CastDBiasTestSuite : public ::testing::TestWithParam<std::tuple<transformer_engine::DType,
transformer_engine::DType,
std::pair<size_t, size_t>>> {};
std::vector<size_t>>> {};

TEST_P(CastDBiasTestSuite, TestCastDBias) {
using namespace transformer_engine;
Expand All @@ -155,7 +158,7 @@ TEST_P(CastDBiasTestSuite, TestCastDBias) {

TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(input_type, InputType,
TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(output_type, OutputType,
performTest<InputType, OutputType>(size.first, size.second);
performTest<InputType, OutputType>(size);
);
);
}
Expand All @@ -169,8 +172,10 @@ INSTANTIATE_TEST_SUITE_P(
::testing::ValuesIn(test_cases)),
[](const testing::TestParamInfo<CastDBiasTestSuite::ParamType>& info) {
std::string name = test::typeName(std::get<0>(info.param)) + "X" +
test::typeName(std::get<1>(info.param)) + "X" +
std::to_string(std::get<2>(info.param).first) + "X" +
std::to_string(std::get<2>(info.param).second);
test::typeName(std::get<1>(info.param));
const auto& shape = std::get<2>(info.param);
for ( const auto& s: shape) {
name += "X" + std::to_string(s);
}
return name;
});
27 changes: 16 additions & 11 deletions tests/cpp/operator/test_cast_dbias_dgelu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -64,17 +64,20 @@ void compute_ref_cast_dbias_dgelu(const IT *input,
}

template <typename IType, typename OType>
void performTest(const size_t N, const size_t H) {
void performTest(const std::vector<size_t>& shape) {
using namespace test;
using CType = fp32;

DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;

Tensor input({N, H}, itype);
Tensor gelu_input({N, H}, itype);
const size_t N = first_dimension(shape);
const size_t H = last_dimension(shape);

Tensor output_c({N, H}, otype);
Tensor input(shape, itype);
Tensor gelu_input(shape, itype);

Tensor output_c(shape, otype);
// dbias has the same data type with "output grad"
Tensor dbias({H}, itype);

Expand Down Expand Up @@ -132,20 +135,20 @@ void performTest(const size_t N, const size_t H) {
compareResults("output_dbias", dbias, ref_output_dbias.get(), true, atol_dbias, rtol_dbias);
}

// Tensor shapes under test: typical 2-D matrices, degenerate rows,
// non-multiple-of-16 trailing dimensions, and one N-D case (dbias reduces
// over all leading dimensions).
std::vector<std::vector<size_t>> test_cases = {
  {128, 128},
  {256, 256},
  {768, 1024},
  {256, 65536},
  {2048, 12288},
  {65536, 128},
  {65536, 160},
  {16384, 1616},
  {1, 128},
  {1, 1296},
  {1, 16},
  {5, 160},
  {5, 4, 3, 160},
  {217, 256},
};

Expand All @@ -154,7 +157,7 @@ std::vector<std::pair<size_t, size_t>> test_cases = {

class CastDBiasDGeluTestSuite : public ::testing::TestWithParam<std::tuple<transformer_engine::DType,
transformer_engine::DType,
std::pair<size_t, size_t>>> {};
std::vector<size_t>>> {};

TEST_P(CastDBiasDGeluTestSuite, TestCastDBiasDgelu) {
using namespace transformer_engine;
Expand All @@ -170,7 +173,7 @@ TEST_P(CastDBiasDGeluTestSuite, TestCastDBiasDgelu) {

TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(input_type, InputType,
TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(output_type, OutputType,
performTest<InputType, OutputType>(size.first, size.second);
performTest<InputType, OutputType>(size);
);
);
}
Expand All @@ -184,8 +187,10 @@ INSTANTIATE_TEST_SUITE_P(
::testing::ValuesIn(test_cases)),
[](const testing::TestParamInfo<CastDBiasDGeluTestSuite::ParamType>& info) {
std::string name = test::typeName(std::get<0>(info.param)) + "X" +
test::typeName(std::get<1>(info.param)) + "X" +
std::to_string(std::get<2>(info.param).first) + "X" +
std::to_string(std::get<2>(info.param).second);
test::typeName(std::get<1>(info.param));
const auto& shape = std::get<2>(info.param);
for ( const auto& s: shape) {
name += "X" + std::to_string(s);
}
return name;
});
47 changes: 32 additions & 15 deletions tests/cpp/operator/test_cast_gated_swiglu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -58,21 +58,29 @@ void compute_ref_cast_dgated_swiglu(const IType * const grad,
}

template <typename IType, typename OType>
void performTest(const size_t rows, const size_t cols) {
void performTest(const std::vector<size_t>& shape) {
using namespace test;

DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;

Tensor grad({rows, cols}, itype);
Tensor input({rows, cols * 2}, itype);
Tensor output_c({rows, cols * 2}, otype);
std::vector<size_t> input_shape = shape;
input_shape[input_shape.size() - 1] *= 2;

const size_t input_size = product(input_shape);

const size_t rows = first_dimension(shape);
const size_t cols = last_dimension(shape);

Tensor grad(shape, itype);
Tensor input(input_shape, itype);
Tensor output_c(input_shape, otype);

fillUniform(&grad);
fillUniform(&input);
setRandomScale(&output_c);

std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(rows * cols * 2);
std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(input_size);

nvte_dswiglu(grad.data(), input.data(), output_c.data(), 0);
cudaDeviceSynchronize();
Expand Down Expand Up @@ -100,21 +108,28 @@ void performTest(const size_t rows, const size_t cols) {
compareResults("output_c", output_c, ref_output_c.get(), true, atol, rtol);
}

// Tensor shapes under test (the last dimension is doubled internally for the
// gated input): typical 2-D matrices, degenerate rows, non-multiple-of-16
// trailing dimensions, and one N-D case.
std::vector<std::vector<size_t>> test_cases = {
  {128, 128},
  {256, 256},
  {768, 1024},
  {256, 65536},
  {2048, 12288},
  {65536, 128},
  {65536, 160},
  {16384, 1616},
  {1, 128},
  {1, 1296},
  {1, 16},
  {5, 160},
  {5, 4, 3, 160},
  {217, 256},
};

} // namespace

class CastSwiGLUTestSuite
: public ::testing::TestWithParam<std::tuple<
transformer_engine::DType, transformer_engine::DType, std::pair<size_t, size_t>>> {};
transformer_engine::DType, transformer_engine::DType, std::vector<size_t>>> {};

TEST_P(CastSwiGLUTestSuite, TestCastSwiGLU) {
using namespace transformer_engine;
Expand All @@ -131,7 +146,7 @@ TEST_P(CastSwiGLUTestSuite, TestCastSwiGLU) {
TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(
input_type, InputType,
TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(
output_type, OutputType, performTest<InputType, OutputType>(size.first, size.second);););
output_type, OutputType, performTest<InputType, OutputType>(size);););
}

INSTANTIATE_TEST_SUITE_P(
Expand All @@ -142,8 +157,10 @@ INSTANTIATE_TEST_SUITE_P(
::testing::ValuesIn(test_cases)),
[](const testing::TestParamInfo<CastSwiGLUTestSuite::ParamType> &info) {
std::string name = test::typeName(std::get<0>(info.param)) + "X" +
test::typeName(std::get<1>(info.param)) + "X" +
std::to_string(std::get<2>(info.param).first) + "X" +
std::to_string(std::get<2>(info.param).second);
test::typeName(std::get<1>(info.param));
const auto& shape = std::get<2>(info.param);
for ( const auto& s: shape) {
name += "X" + std::to_string(s);
}
return name;
});
Loading

0 comments on commit 0e1ea26

Please sign in to comment.