ROCm · wangye805 · Apr 22, 2026 · Apr 23, 2026 · Apr 23, 2026 · Apr 23, 2026
diff --git a/transformer_engine/common/rocshmem_api/rocshmem_waitkernel.hip b/transformer_engine/common/rocshmem_api/rocshmem_waitkernel.hip
@@ -13,6 +13,7 @@
 #include "math.h"
 #include "ptx.cuh"
 #include "rocm_vectorized_2d.cuh"
+#include "tdm.cuh"
 #include "transformer_engine/activation.h"
 #include "transformer_engine/cast.h"
 #include "vectorized_pointwise.h"
@@ -134,6 +135,28 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
     const size_t row_base = chunk_it_offset_y; 
 
     // Initiate bulk tensor copy
+#if defined(__gfx1250__)
+    {  // gfx1250-only path (enclosing #if): copy this iteration's input tiles with the tdm:: helpers instead of copy_2d_to_shared<>
+      constexpr uint32_t data_sz = tdm::get_data_size_from_bits(sizeof(IType) * 8);  // derived from the bit width of IType
+      if constexpr (IS_DGATED) {
+        // grad is read with stride=cols while act/gate are read with stride=2*cols, so grad gets its own copy call
+        tdm::copy_2d_to_shared(
+            &in_grad_sh[0], grad_ptr, chunk_it_offset_x, chunk_it_offset_y,
+            SHMEM_DIM_X, SHMEM_DIM_Y, cols, rows, cols, data_sz);  // NOTE(review): arg order differs from the non-TDM call (stride, DIM_Y, DIM_X, rows, cols) -- confirm against tdm.cuh
+        tdm::copy_2d_to_shared_x2(
+            &in_act_sh[0], input_act, chunk_it_offset_x, chunk_it_offset_y,
+            &in_gate_sh[0], input_gate, chunk_it_offset_x, chunk_it_offset_y,
+            SHMEM_DIM_X, SHMEM_DIM_Y, cols, rows, 2*cols, data_sz);  // act and gate use the same tile offsets and the same 2*cols argument
+      } else {
+        tdm::copy_2d_to_shared_x2(  // same act/gate copy as in the IS_DGATED branch; only the grad copy is conditional
+            &in_act_sh[0], input_act, chunk_it_offset_x, chunk_it_offset_y,
+            &in_gate_sh[0], input_gate, chunk_it_offset_x, chunk_it_offset_y,
+            SHMEM_DIM_X, SHMEM_DIM_Y, cols, rows, 2*cols, data_sz);
+      }
+      tdm::wait_tensorcnt_0();  // presumably waits for the copies issued above to complete -- confirm against tdm.cuh
+      __syncthreads();  // block-wide barrier, matching the one that ends the non-TDM path
+    }
+#else
     if constexpr (IS_DGATED) {
       copy_2d_to_shared<IType, VECTOR_WIDTH, IS_ALIGNED>(&in_grad_sh[0], grad_ptr, chunk_it_offset_x, chunk_it_offset_y,
                         cols, SHMEM_DIM_Y, SHMEM_DIM_X, rows, cols);
@@ -142,12 +165,13 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
     // Act
     copy_2d_to_shared<IType, VECTOR_WIDTH, IS_ALIGNED>(&in_act_sh[0], input_act, chunk_it_offset_x, chunk_it_offset_y,
                       2*cols, SHMEM_DIM_Y, SHMEM_DIM_X, rows, cols);
-    
+
     // Gate
     copy_2d_to_shared<IType, VECTOR_WIDTH, IS_ALIGNED>(&in_gate_sh[0], input_gate, chunk_it_offset_x, chunk_it_offset_y,
                       2*cols, SHMEM_DIM_Y, SHMEM_DIM_X, rows, cols);
 
     __syncthreads();
+#endif
 
     const int iteration_scale_colwise_offset_Y = scales_colwise_chunk_offset_Y + it;
     const int iteration_scale_rowwise_offset_Y = scales_rowwise_chunk_offset_Y + it * BUFFER_DIM_Y;
@@ -353,6 +377,33 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
 
     __syncthreads();
 
+#if defined(__gfx1250__)
+    {  // gfx1250-only path (enclosing #if): write this iteration's output tiles with tdm::store_2d_to_global
+      constexpr uint32_t out_data_sz = tdm::get_data_size_from_bits(sizeof(OType) * 8);  // derived from the bit width of OType
+      if constexpr (USE_ROWWISE_SCALING) {
+        tdm::store_2d_to_global(&out_act_rowwise_sh[0], output_act_rowwise,
+                                chunk_it_offset_x, chunk_it_offset_y,
+                                SHMEM_DIM_X, SHMEM_DIM_Y, cols, rows, output_cols, out_data_sz);  // NOTE(review): output_cols is presumably the destination row stride -- confirm against tdm.cuh
+        if constexpr (IS_DGATED) {  // the gate output is stored only when IS_DGATED
+          tdm::store_2d_to_global(&out_gate_rowwise_sh[0], output_gate_rowwise,
+                                  chunk_it_offset_x, chunk_it_offset_y,
+                                  SHMEM_DIM_X, SHMEM_DIM_Y, cols, rows, output_cols, out_data_sz);
+        }
+      }
+      if constexpr (USE_COLWISE_SCALING) {  // colwise outputs mirror the rowwise stores above with the *_colwise buffers
+        tdm::store_2d_to_global(&out_act_colwise_sh[0], output_act_colwise,
+                                chunk_it_offset_x, chunk_it_offset_y,
+                                SHMEM_DIM_X, SHMEM_DIM_Y, cols, rows, output_cols, out_data_sz);
+        if constexpr (IS_DGATED) {
+          tdm::store_2d_to_global(&out_gate_colwise_sh[0], output_gate_colwise,
+                                  chunk_it_offset_x, chunk_it_offset_y,
+                                  SHMEM_DIM_X, SHMEM_DIM_Y, cols, rows, output_cols, out_data_sz);
+        }
+      }
+      tdm::wait_tensorcnt_0();  // presumably waits for the stores issued above to complete -- confirm against tdm.cuh
+      __syncthreads();  // block-wide barrier, matching the one that ends the non-TDM path
+    }
+#else
     if constexpr (USE_ROWWISE_SCALING) {
       bulk_tensor_2d_shared_to_global<OType, VECTOR_WIDTH, IS_ALIGNED>(&out_act_rowwise_sh[0], output_act_rowwise, chunk_it_offset_x,
                                       chunk_it_offset_y, output_cols, SHMEM_DIM_Y, SHMEM_DIM_X, rows, cols);
@@ -361,7 +412,7 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
                                       chunk_it_offset_y, output_cols, SHMEM_DIM_Y, SHMEM_DIM_X, rows, cols);
       }
     }
-    
+
     if constexpr (USE_COLWISE_SCALING) {
       bulk_tensor_2d_shared_to_global<OType, VECTOR_WIDTH, IS_ALIGNED>(&out_act_colwise_sh[0], output_act_colwise, chunk_it_offset_x,
                                       chunk_it_offset_y, output_cols, SHMEM_DIM_Y, SHMEM_DIM_X, rows, cols);
@@ -371,6 +422,7 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
       }
     }
     __syncthreads();
+#endif
   }
 }
 } // namespace gated_kernels

@@ -12,6 +12,7 @@
 #include "math.h"
 #include "ptx.cuh"
 #include "rocm_vectorized_2d.cuh"
+#include "tdm.cuh"
 #include "transformer_engine/cast.h"
 #include "../transpose/cast_transpose.h"
 #include "vectorized_pointwise.h"
@@ -161,15 +162,31 @@ __global__ void __launch_bounds__(MXFP8_THREADS_PER_CHUNK)
       const int chunk_it_offset_y = chunk_offset_Y + iter * MXFP8_BUFFER_DIM_Y;
       const int chunk_it_offset_x = chunk_offset_X;
       const size_t row_base = chunk_it_offset_y;
+#if defined(__gfx1250__)
+      constexpr uint32_t data_sz = tdm::get_data_size_from_bits(sizeof(IType) * 8);  // derived from the bit width of IType
       if constexpr (IS_DACT) {
-        copy_2d_to_shared<IType, VECTOR_WIDTH, IS_ALIGNED>(&act_in_sh[0][0], act_input_ptr, 
-                          chunk_it_offset_x, chunk_it_offset_y, cols, 
+        tdm::copy_2d_to_shared_x2(  // IS_DACT: input and activation-input tiles are fetched by one paired call
+            &in_sh[0][0], input_ptr, chunk_it_offset_x, chunk_it_offset_y,
+            &act_in_sh[0][0], act_input_ptr, chunk_it_offset_x, chunk_it_offset_y,
+            MXFP8_SHMEM_DIM_X, MXFP8_SHMEM_DIM_Y, cols, rows, cols, data_sz);  // NOTE(review): arg order differs from the non-TDM copy_2d_to_shared<> call -- confirm against tdm.cuh
+      } else {
+        tdm::copy_2d_to_shared(  // otherwise only the input tile is fetched
+            &in_sh[0][0], input_ptr, chunk_it_offset_x, chunk_it_offset_y,
+            MXFP8_SHMEM_DIM_X, MXFP8_SHMEM_DIM_Y, cols, rows, cols, data_sz);
+      }
+      tdm::wait_tensorcnt_0();  // presumably waits for the copies issued above to complete -- confirm against tdm.cuh
+      __syncthreads();  // block-wide barrier, matching the one that ends the non-TDM path
+#else
+      if constexpr (IS_DACT) {
+        copy_2d_to_shared<IType, VECTOR_WIDTH, IS_ALIGNED>(&act_in_sh[0][0], act_input_ptr,
+                          chunk_it_offset_x, chunk_it_offset_y, cols,
                           MXFP8_SHMEM_DIM_Y, MXFP8_SHMEM_DIM_X, rows, cols);
       }
-      copy_2d_to_shared<IType, VECTOR_WIDTH, IS_ALIGNED>(&in_sh[0][0], input_ptr, chunk_it_offset_x, 
+      copy_2d_to_shared<IType, VECTOR_WIDTH, IS_ALIGNED>(&in_sh[0][0], input_ptr, chunk_it_offset_x,
                         chunk_it_offset_y, cols, MXFP8_SHMEM_DIM_Y,
                         MXFP8_SHMEM_DIM_X, rows, cols);
       __syncthreads();
+#endif
 
       if constexpr (USE_ROWWISE_SCALING) {
         Vec<IType, ELEMS_PER_THREAD> in;
@@ -312,6 +329,23 @@ __global__ void __launch_bounds__(MXFP8_THREADS_PER_CHUNK)
 
       __syncthreads();
 
+#if defined(__gfx1250__)
+      constexpr uint32_t out_data_sz = tdm::get_data_size_from_bits(sizeof(OType) * 8);  // derived from the bit width of OType
+      if constexpr (USE_ROWWISE_SCALING) {  // rowwise output tile is stored only when rowwise scaling is enabled
+        tdm::store_2d_to_global(&out_rowwise_sh[0][0], output_rowwise,
+                                chunk_it_offset_x, chunk_it_offset_y,
+                                MXFP8_SHMEM_DIM_X, MXFP8_SHMEM_DIM_Y,
+                                cols, rows, cols, out_data_sz);  // NOTE(review): the trailing cols is presumably the destination row stride -- confirm against tdm.cuh
+      }
+      if constexpr (USE_COLWISE_SCALING) {  // colwise output tile uses the same offsets and sizes as the rowwise store
+        tdm::store_2d_to_global(&out_colwise_sh[0][0], output_colwise,
+                                chunk_it_offset_x, chunk_it_offset_y,
+                                MXFP8_SHMEM_DIM_X, MXFP8_SHMEM_DIM_Y,
+                                cols, rows, cols, out_data_sz);
+      }
+      tdm::wait_tensorcnt_0();  // presumably waits for the stores issued above to complete -- confirm against tdm.cuh
+      __syncthreads();  // block-wide barrier, matching the one that ends the non-TDM path
+#else
       if constexpr (USE_ROWWISE_SCALING) {
         bulk_tensor_2d_shared_to_global<OType, VECTOR_WIDTH, IS_ALIGNED>(&out_rowwise_sh[0][0], output_rowwise, chunk_it_offset_x,
                                         chunk_it_offset_y, cols, MXFP8_SHMEM_DIM_Y,
@@ -324,6 +358,7 @@ __global__ void __launch_bounds__(MXFP8_THREADS_PER_CHUNK)
       }
 
       __syncthreads();
+#endif
     }
   }
 

@@ -14,6 +14,7 @@
 #include "math.h"
 #include "ptx.cuh"
 #include "rocm_vectorized_2d.cuh"
+#include "tdm.cuh"
 #include "transformer_engine/activation.h"
 #include "transformer_engine/cast.h"
 #include "../transpose/cast_transpose.h"
@@ -85,10 +86,21 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
     const int chunk_it_offset_y = chunk_offset_Y + iter * BUFFER_DIM_Y;
     const int chunk_it_offset_x = chunk_offset_X;
 
-    copy_2d_to_shared<IType, VECTOR_WIDTH, IS_ALIGNED>(&in_sh[0][0], input_ptr, chunk_it_offset_x, 
+#if defined(__gfx1250__)
+    {  // gfx1250-only path (enclosing #if): copy this iteration's input tile with tdm::copy_2d_to_shared
+      constexpr uint32_t data_sz = tdm::get_data_size_from_bits(sizeof(IType) * 8);  // derived from the bit width of IType
+      tdm::copy_2d_to_shared(&in_sh[0][0], input_ptr,
+                             chunk_it_offset_x, chunk_it_offset_y,
+                             SHMEM_DIM_X, SHMEM_DIM_Y, cols, rows, cols, data_sz);  // NOTE(review): arg order differs from the non-TDM call (cols, DIM_Y, DIM_X, rows, cols) -- confirm against tdm.cuh
+      tdm::wait_tensorcnt_0();  // presumably waits for the copy issued above to complete -- confirm against tdm.cuh
+      __syncthreads();  // block-wide barrier, matching the one that ends the non-TDM path
+    }
+#else
+    copy_2d_to_shared<IType, VECTOR_WIDTH, IS_ALIGNED>(&in_sh[0][0], input_ptr, chunk_it_offset_x,
                       chunk_it_offset_y, cols, SHMEM_DIM_Y,
                       SHMEM_DIM_X, rows, cols);
     __syncthreads();
+#endif
 
     const int scale_offset_Y =
         USE_ROWWISE_SCALING ? (scales_rowwise_chunk_offset_Y + iter * BUFFER_DIM_Y + tid_rowwise_Y)
@@ -126,11 +138,22 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
 
     __syncthreads();
 
+#if defined(__gfx1250__)
+    {  // gfx1250-only path (enclosing #if): write this iteration's output tile with tdm::store_2d_to_global
+      constexpr uint32_t out_data_sz = tdm::get_data_size_from_bits(sizeof(OType) * 8);  // derived from the bit width of OType
+      tdm::store_2d_to_global(&out_sh[0][0], output_ptr,
+                              chunk_it_offset_x, chunk_it_offset_y,
+                              SHMEM_DIM_X, SHMEM_DIM_Y, cols, rows, cols, out_data_sz);  // NOTE(review): arg order differs from bulk_tensor_2d_shared_to_global<> (cols, DIM_Y, DIM_X, rows, cols) -- confirm against tdm.cuh
+      tdm::wait_tensorcnt_0();  // presumably waits for the store issued above to complete -- confirm against tdm.cuh
+      __syncthreads();  // block-wide barrier, matching the one that ends the non-TDM path
+    }
+#else
     bulk_tensor_2d_shared_to_global<OType, VECTOR_WIDTH, IS_ALIGNED>(&out_sh[0][0], output_ptr, chunk_it_offset_x,
                                     chunk_it_offset_y, cols, SHMEM_DIM_Y,
                                     SHMEM_DIM_X, rows, cols);
 
     __syncthreads();
+#endif
   }
 }
 } // namespace dequantization