ROCm · VeeraRajasekhar · Apr 17, 2026 · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026
diff --git a/commit.txt b/commit.txt
@@ -0,0 +1,57 @@
+Fix cpplint violations in common and PyTorch extension code
+
+transformer_engine/common/amd_detail/hip_float8.h
+  -Host constructor: multi-statement if/else now uses braces (readability/braces).
+
+transformer_engine/common/cast/mxfp8/rocm_quantize_mxfp8.cuh
+  -Include <cstdint>; typedef for gfx950 vector type uses int16_t instead of
+  short (runtime/int).
+
+transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp
+  -dladdr: pass the function address through a small union instead of a
+  C-style (void*) cast (readability/casting); relies on union type punning.
+  -get_ck_log_stream: else branch restructured with nested if so else/brace
+  pairing satisfies cpplint (readability/braces).
+
+transformer_engine/common/fused_attn_rocm/fused_attn.cpp
+  -check_set_window_size: replace std::make_pair<int64_t,int64_t>(...) with
+  std::pair<int64_t,int64_t>(...) (build/explicit_make_pair).
+  -Replace alternative tokens `or` with || (readability/alt_tokens).
+  -log_fused_attn_config: same for sliding-window condition.
+
+transformer_engine/common/gemm/rocm_gemm.cu
+  -ObjCache / NameMapper: mark single-argument constructors explicit
+  (runtime/explicit).
+  -HIPBLASLT scaling_mode check: split #if/#else branches so each if has its
+  own braced body; use static_cast<int> instead of C-style cast
+  (readability/braces, readability/casting).
+  -Debug logging: (int) casts -> static_cast<int> for hipDataType fields
+  (readability/casting).
+  -ServiceStreamKey: use std::uint64_t alias instead of unsigned long long
+  (runtime/int).
+
+transformer_engine/common/normalization/common.cpp
+  -getNormalizationPlan: after optional CUDNN plan, use if (!plan) { ... } for
+  TE plans instead of } else #endif if (readability/braces across preprocessor).
+
+transformer_engine/common/normalization/layernorm/ln_api.cpp
+  -Forward/backward: default norm_backend to Te; optional CUDNN path only under
+  #ifndef __HIP_PLATFORM_AMD__; set is_aligned only when backend is Te, so
+  preprocessor does not split if/else from its braces (readability/braces).
+
+transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp
+  -Same pattern as ln_api for forward (including HIP constexpr
+  gamma_in_weight_dtype) and backward cudnn vs Te (readability/braces).
+
+transformer_engine/common/permutation/permutation.cu
+  -MoE unpermute kernel: functional-style float(...) casts replaced with
+  static_cast<float>(...) (readability/casting).
+
+transformer_engine/common/util/logging.h
+  -NVTE_CHECK_HIPBLASLT macro: std::to_string((int)status) ->
+  std::to_string(static_cast<int>(status)) (readability/casting).
+
+transformer_engine/pytorch/csrc/extensions/gemm.cpp
+  -Comm overlap RS path: HIP p2p vs split_overlap_rs restructured with proper
+  #else for non-HIP so } else #endif { does not confuse brace rules
+  (readability/braces).
@@ -61,7 +61,11 @@ union _te_hip_fp8 {
   __device__ operator float() const;
 
   __host__ _te_hip_fp8<FNUZ, OCP>(const float& v) {
-    if (te_fp8_fnuz()) fnuz=v; else ocp=v;
+    if (te_fp8_fnuz()) {
+      fnuz = v;
+    } else {
+      ocp = v;
+    }
   }
   __device__ _te_hip_fp8<FNUZ, OCP>(const float& v);
 };

@@ -7,6 +7,8 @@
 // drop-in replacement for rocm quantize_mxfp8 kernels
 //#include "hip/hip_runtime.h" //dummy include to prevent hipification adding this header
 
+#include <cstdint>
+
 constexpr size_t MXFP8_CHUNK_DIM_Y = 64;
 constexpr size_t MXFP8_CHUNK_DIM_X = 64;
 constexpr size_t MXFP8_THREADS_PER_CHUNK = 64;
@@ -15,7 +17,7 @@ constexpr size_t ELEMS_PER_THREAD = 16;
 constexpr size_t MXFP8_BUFFER_DIM_Y = 32;  // only 32 is supported
 
 #if defined(__gfx950__) && __HIP_DEVICE_COMPILE__
-typedef short mxfp8_v2i16_t __attribute__((ext_vector_type(2)));
+typedef int16_t mxfp8_v2i16_t __attribute__((ext_vector_type(2)));
 #endif
 
 template <bool IS_DBIAS, bool IS_DACT, bool IS_ACT, typename ParamOP,

@@ -76,7 +76,13 @@ void set_aiter_asm_dir() {
   static std::once_flag aiter_asm_dir_once;
   std::call_once(aiter_asm_dir_once, []() {
     Dl_info info;
-    dladdr((void*)set_aiter_asm_dir, &info);
+    // dladdr expects void*; pun the function pointer through a union (compiler-supported) to avoid a cast.
+    union {
+      void (*fn)();
+      void *addr;
+    } sym{};
+    sym.fn = set_aiter_asm_dir;
+    dladdr(sym.addr, &info);
     const char* log_ck_config_env = std::getenv("NVTE_LOG_CK_CONFIG");
     bool log_ck_config = log_ck_config_env && std::string(log_ck_config_env) == "1";
     // Check if user has set AITER_ASM_DIR, if yes, skip auto setting and log
@@ -130,9 +136,10 @@ std::ostream* get_ck_log_stream() {
       if (!log_dir_str.empty() && log_dir_str != "0") {
         if (log_dir_str == "1") {
           log_stream = &std::cout;
-        }
-        else if (open_ck_fused_attn_log_file(log_file, "ck_fused_attn", log_dir_str)) {
-          log_stream = &log_file;
+        } else {
+          if (open_ck_fused_attn_log_file(log_file, "ck_fused_attn", log_dir_str)) {
+            log_stream = &log_file;
+          }
         }
       }
     }

@@ -146,26 +146,26 @@ std::pair<int64_t, int64_t> check_set_window_size(NVTE_Mask_Type attn_mask_type,
       nvte_log_fused_attn_config = true;
   }
   if(attn_mask_type==NVTE_CAUSAL_MASK || attn_mask_type==NVTE_PADDING_CAUSAL_MASK || attn_mask_type==NVTE_CAUSAL_BOTTOM_RIGHT_MASK || attn_mask_type==NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK){
-    if(window_size==std::make_pair<int64_t, int64_t>(-1, -1) || (window_size.first >=0 && window_size.second!=0)){
+    if(window_size==std::pair<int64_t, int64_t>(-1, -1) || (window_size.first >=0 && window_size.second!=0)){
       //TODO: better INFO logging
       if(nvte_log_fused_attn_config){
         std::cout<<"window_size should be (-1, 0) or (>=0, 0) for attn_mask_type="<<attn_mask_type<<std::endl;
       }
       window_size.second = 0;
       return window_size;
-    }else if( window_size!=std::make_pair<int64_t, int64_t>(-1, 0) && (window_size.first < 0 || window_size.second != 0)){
+    }else if( window_size!=std::pair<int64_t, int64_t>(-1, 0) && (window_size.first < 0 || window_size.second != 0)){
       NVTE_ERROR("window_size should be (-1, 0) or (>=0, 0) for attn_mask_type=" + std::to_string(attn_mask_type));
     }
   }else if(attn_mask_type==NVTE_NO_MASK || attn_mask_type==NVTE_PADDING_MASK){
     //no_mask and padding mask
-    if(window_size==std::make_pair<int64_t, int64_t>(-1, 0)){
+    if(window_size==std::pair<int64_t, int64_t>(-1, 0)){
       //TODO: better INFO logging
       if(nvte_log_fused_attn_config){
         std::cout<<"window_size should be (-1, -1) or (>=0, >=0) for attn_mask_type="<<attn_mask_type<<std::endl;
       }
       window_size.second=-1;
       return window_size;
-    }else if(window_size!=std::make_pair<int64_t, int64_t>(-1, -1) && (window_size.first < 0 or window_size.second < 0)){
+    }else if(window_size!=std::pair<int64_t, int64_t>(-1, -1) && (window_size.first < 0 || window_size.second < 0)){
       NVTE_ERROR("window_size should be (-1, -1) or (>=0, >=0) for attn_mask_type=" + std::to_string(attn_mask_type)); 
     }
   }else{
@@ -267,7 +267,7 @@ void log_fused_attn_config(
   std::cout<<"d_qk: "<<head_dim_qk<<", ";
   std::cout<<"d_v: "<<head_dim_v<<", ";
   std::cout<<"(window_size_left, window_size_right): ("<<window_size_left<<", "<<window_size_right<<") ";
-  if(window_size_left >0 or window_size_right >0){
+  if(window_size_left >0 || window_size_right >0){
     std::cout<<", (sliding window)";
   }
   std::cout<<std::endl;

@@ -117,7 +117,7 @@ public:
     data[key][stream] = item; 
   }
 
-  ObjCache(void (*a_offload)(const Data&)): offload(a_offload) {}
+  explicit ObjCache(void (*a_offload)(const Data&)): offload(a_offload) {}
 
   ~ObjCache()
   {
@@ -461,7 +461,7 @@ template<typename T>
 class NameMapper
 {
 public:
-  NameMapper(const std::unordered_map<T, std::string_view>& name_map): map(name_map) {}
+  explicit NameMapper(const std::unordered_map<T, std::string_view>& name_map): map(name_map) {}
   const std::string_view &getName(const T &val) {
     return map.at(val);
   }
@@ -769,14 +769,17 @@ protected:
       }
 
 #if HIPBLASLT_VERSION_MAJOR > 0 || HIPBLASLT_VERSION_MINOR >= 15
-      if (cfg.scaling_mode < 0 || cfg.scaling_mode >= (int)HIPBLASLT_MATMUL_MATRIX_SCALE_END)
+      if (cfg.scaling_mode < 0 ||
+          cfg.scaling_mode >= static_cast<int>(HIPBLASLT_MATMUL_MATRIX_SCALE_END)) {
+        std::cout << "[WARNING] Unsupported scaling mode at " << line << "\n";
+        continue;
+      }
 #else
-      if (cfg.scaling_mode != 0)
-#endif
-      {
+      if (cfg.scaling_mode != 0) {
         std::cout << "[WARNING] Unsupported scaling mode at " << line << "\n";
         continue;
       }
+#endif
 
       auto fp8_filter = te_fp8_fnuz()
                             ? [](const hipDataType& val) 
@@ -966,10 +969,10 @@ void hipblaslt_gemm(const Tensor *inputA,
     std::cout << "m=" << m << " k=" << k << " n=" << n 
         << " transa=" << (param.transA == HIPBLAS_OP_T ? "T" : "N")
         << " transb=" << (param.transB == HIPBLAS_OP_T ? "T" : "N")
-        << " A_type=" << (int)(param.Atype)
-        << " B_type=" << (int)(param.Btype)
-        << " D_type=" << (int)outputD->data.dtype
-        << " bias_type=" << (int)inputBias->data.dtype
+        << " A_type=" << static_cast<int>(param.Atype)
+        << " B_type=" << static_cast<int>(param.Btype)
+        << " D_type=" << static_cast<int>(outputD->data.dtype)
+        << " bias_type=" << static_cast<int>(inputBias->data.dtype)
         << " grad=" << grad
         << " bias=" << (inputBias->data.dptr != nullptr)
         << " gelu=" << (outputPreGelu->data.dptr != nullptr)
@@ -1386,7 +1389,7 @@ void hipblaslt_gemm(const Tensor *inputA,
 }
 
 
-typedef unsigned long long ServiceStreamKey;
+using ServiceStreamKey = std::uint64_t;
 
 ServiceStreamKey make_service_stream_key(const int device_id, const int cu_count) {
   return (static_cast<ServiceStreamKey>(device_id) << 32) | static_cast<ServiceStreamKey>(cu_count);

@@ -544,24 +544,26 @@ NormalizationPlanBase* NormalizationPlanRegistry::getNormalizationPlan(
     plan = std::make_unique<CudnnNormalizationPlan>(NormType, NormStage, wtype, itype, otype, ctype,
                                                     batch_size, hidden_size, sm_count,
                                                     zero_centered_gamma, mode, training);
-  } else
+  }
 #endif
-  if (NormStage == NVTE_Norm_Stage::Forward) {
-    plan = std::make_unique<TeNormalizationPlan<ForwardKernelParams>>(
-        NormType, NormStage, wtype, itype, otype, ctype, batch_size, hidden_size, sm_count,
-        zero_centered_gamma, is_tuned
+  if (!plan) {
+    if (NormStage == NVTE_Norm_Stage::Forward) {
+      plan = std::make_unique<TeNormalizationPlan<ForwardKernelParams>>(
+          NormType, NormStage, wtype, itype, otype, ctype, batch_size, hidden_size, sm_count,
+          zero_centered_gamma, is_tuned
 #ifdef __HIP_PLATFORM_AMD__
-        , mode, training
+          , mode, training
 #endif
-      );
-  } else {
-    plan = std::make_unique<TeNormalizationPlan<BackwardKernelParams>>(
-        NormType, NormStage, wtype, itype, otype, ctype, batch_size, hidden_size, sm_count,
-        zero_centered_gamma, is_tuned
+        );
+    } else {
+      plan = std::make_unique<TeNormalizationPlan<BackwardKernelParams>>(
+          NormType, NormStage, wtype, itype, otype, ctype, batch_size, hidden_size, sm_count,
+          zero_centered_gamma, is_tuned
 #ifdef __HIP_PLATFORM_AMD__
-        , mode, training
+          , mode, training
 #endif
-        );
+          );
+    }
   }
   normalizationPlanMap.insert({key, std::move(plan)});
   return normalizationPlanMap[key].get();

@@ -68,7 +68,7 @@ void layernorm_fwd(const Tensor& x,      // BxSxhidden_size
     CheckOutputTensor(*rsigma, "rsigma");
   }
 
-  NVTE_Norm_Backend norm_backend;
+  NVTE_Norm_Backend norm_backend = NVTE_Norm_Backend::Te;
   bool is_aligned = true;
 #ifndef __HIP_PLATFORM_AMD__
   bool cudnn_backend = use_cudnn_norm_fwd() || is_mxfp8_scaling(z->scaling_mode);
@@ -85,10 +85,9 @@ void layernorm_fwd(const Tensor& x,      // BxSxhidden_size
     // TODO: add check for GPU ARCH
     norm_backend = NVTE_Norm_Backend::Cudnn;
     gamma_in_weight_dtype = use_zero_centered_gamma_in_weight_dtype();
-  } else
+  }
 #endif //__HIP_PLATFORM_AMD__
-  {
-    norm_backend = NVTE_Norm_Backend::Te;
+  if (norm_backend == NVTE_Norm_Backend::Te) {
     is_aligned = is_ptr_aligned(z->data.dptr, x.data.dptr, gamma.data.dptr, beta.data.dptr,
                                 mu->data.dptr, rsigma->data.dptr);
   }
@@ -169,18 +168,17 @@ void layernorm_bwd(const Tensor& dz, const Tensor& x, const Tensor& mu, const Te
     CheckOutputTensor(*dbeta, "dbeta");
   }
 
-  NVTE_Norm_Backend norm_backend;
+  NVTE_Norm_Backend norm_backend = NVTE_Norm_Backend::Te;
   bool is_aligned = true;
   bool gamma_in_weight_dtype = false;
 #ifndef __HIP_PLATFORM_AMD__
   if (use_cudnn_norm_bwd()) {
     // TODO: add check for GPU ARCH
     norm_backend = NVTE_Norm_Backend::Cudnn;
     gamma_in_weight_dtype = use_zero_centered_gamma_in_weight_dtype();
-  } else
+  }
 #endif
-  {
-    norm_backend = NVTE_Norm_Backend::Te;
+  if (norm_backend == NVTE_Norm_Backend::Te) {
     is_aligned = is_ptr_aligned(x.data.dptr, gamma.data.dptr, mu.data.dptr, rsigma.data.dptr,
                                 dx->data.dptr, dz.data.dptr, dbeta->data.dptr, dgamma->data.dptr);
   }

@@ -54,7 +54,7 @@ void rmsnorm_fwd(const Tensor &x, const Tensor &gamma, const float epsilon, Tens
     CheckOutputTensor(*rsigma, "rsigma");
   }
 
-  NVTE_Norm_Backend norm_backend;
+  NVTE_Norm_Backend norm_backend = NVTE_Norm_Backend::Te;
   bool is_aligned = true;
 #ifndef __HIP_PLATFORM_AMD__
   bool cudnn_backend = use_cudnn_norm_fwd() || is_mxfp8_scaling(z->scaling_mode);
@@ -76,10 +76,9 @@ void rmsnorm_fwd(const Tensor &x, const Tensor &gamma, const float epsilon, Tens
     // TODO: add check for GPU ARCH
     norm_backend = NVTE_Norm_Backend::Cudnn;
     gamma_in_weight_dtype = use_zero_centered_gamma_in_weight_dtype();
-  } else
+  }
 #endif
-  {
-    norm_backend = NVTE_Norm_Backend::Te;
+  if (norm_backend == NVTE_Norm_Backend::Te) {
     is_aligned = is_ptr_aligned(z->data.dptr, x.data.dptr, gamma.data.dptr, rsigma->data.dptr);
   }
 
@@ -148,18 +147,17 @@ void rmsnorm_bwd(const Tensor &dz, const Tensor &x, const Tensor &rsigma, const
     CheckOutputTensor(*dgamma, "dgamma");
   }
 
-  NVTE_Norm_Backend norm_backend;
+  NVTE_Norm_Backend norm_backend = NVTE_Norm_Backend::Te;
   bool is_aligned = true;
   bool gamma_in_weight_dtype = false;
 #ifndef __HIP_PLATFORM_AMD__
   if (use_cudnn_norm_bwd()) {
     // TODO: add check for GPU ARCH
     norm_backend = NVTE_Norm_Backend::Cudnn;
     gamma_in_weight_dtype = use_zero_centered_gamma_in_weight_dtype();
-  } else
+  }
 #endif
-  {
-    norm_backend = NVTE_Norm_Backend::Te;
+  if (norm_backend == NVTE_Norm_Backend::Te) {
     is_aligned = is_ptr_aligned(x.data.dptr, gamma.data.dptr, rsigma.data.dptr, dx->data.dptr,
                                 dz.data.dptr, dgamma->data.dptr);
   }

@@ -81,12 +81,12 @@ __global__ void moe_unpermute_kernel(const T *input, T *unpermuted_output, const
 #endif
 
       for (int e = 0; e < kElementsPerAccess; e++) {
-        frag_sum[e] = float(TCompute(frag_load_store_ptr[e]));
+        frag_sum[e] = static_cast<float>(TCompute(frag_load_store_ptr[e]));
       }
 
       if (hasProb) {
         for (int e = 0; e < kElementsPerAccess; e++) {
-          frag_sum[e] = frag_sum[e] * float(s_prob[0]);
+          frag_sum[e] = frag_sum[e] * static_cast<float>(s_prob[0]);
         }
       }
     } else {
@@ -120,7 +120,7 @@ __global__ void moe_unpermute_kernel(const T *input, T *unpermuted_output, const
       }
 
       for (int e = 0; e < kElementsPerAccess; e++) {
-        frag_sum[e] += float(frag_elem[e]);
+        frag_sum[e] += static_cast<float>(frag_elem[e]);
       }
     }
 
@@ -129,7 +129,7 @@ __global__ void moe_unpermute_kernel(const T *input, T *unpermuted_output, const
     for (int e = 0; e < kElementsPerAccess; e++) {
       if constexpr ((std::is_same_v<T, transformer_engine::fp8e4m3> || std::is_same_v<T, transformer_engine::fp8e5m2>) &&
                     (!hasProb)) {
-        frag_sum[e] = frag_sum[e] / float(TCompute(topK));
+        frag_sum[e] = frag_sum[e] / static_cast<float>(TCompute(topK));
       }
       frag_load_store_ptr[e] = T(TCompute(frag_sum[e]));
     }

@@ -68,7 +68,7 @@
     const hipblasStatus_t status_NVTE_CHECK_CUBLAS = (expr);            \
     if (status_NVTE_CHECK_CUBLAS != CUBLAS_STATUS_SUCCESS) {            \
       NVTE_ERROR("HIPBLASLT Error: ",                                   \
-                 std::to_string((int)status_NVTE_CHECK_CUBLAS));        \
+                 std::to_string(static_cast<int>(status_NVTE_CHECK_CUBLAS))); \
     }                                                                   \
   } while (false)
 #else //cublas