Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions scripts/convert-minimax-m2-hf-to-megatron.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/bin/bash
# ============================================================
# Script 1/3: HF -> Megatron Weight Conversion
# MiniMax-M2.5 (229B MoE)
#
# Launches tools/convert_hf_to_torch_dist.py under torchrun.
# Every variable assigned with ${VAR:-default} below can be overridden
# from the environment. The converter path is relative, so the script
# has to be started from the directory that contains tools/.
# ============================================================
set -ex

# ---- Paths (modify according to your environment) ----
HF_CKPT=${HF_CKPT:-"/root/MiniMax-M2.5"}
SAVE_DIR=${SAVE_DIR:-"/root/MiniMax-M2.5_torch_dist"}

# Model hyper-parameters (MODEL_ARGS) live next to this script.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
source "${SCRIPT_DIR}/models/minimax-m2.sh"

# ---- Parallelism config (adjust based on available GPUs) ----
TP=${TP:-2}
PP=${PP:-2}
EP=${EP:-4}
WORLD_SIZE=${WORLD_SIZE:-$((TP * PP * EP))}
NNODES=${NNODES:-1}
NPROC_PER_NODE=${NPROC_PER_NODE:-$((WORLD_SIZE / NNODES))}
NODE_RANK=${NODE_RANK:-0}
MASTER_ADDR=${MASTER_ADDR:-127.0.0.1}
MASTER_PORT=${MASTER_PORT:-29500}

# The default NPROC_PER_NODE uses integer division, so reject layouts where
# the per-node process count does not multiply back to WORLD_SIZE.
if (( NPROC_PER_NODE * NNODES != WORLD_SIZE )); then
  echo "NPROC_PER_NODE * NNODES must equal WORLD_SIZE (${WORLD_SIZE})." >&2
  exit 1
fi

# WORLD_SIZE must match the requested Megatron parallel layout.
# All expansions are quoted: MODEL_ARGS contains "[1]*62", which an unquoted
# expansion would treat as a glob pattern, and paths may contain spaces.
torchrun \
  --nproc-per-node "${NPROC_PER_NODE}" \
  --nnodes "${NNODES}" \
  --node-rank "${NODE_RANK}" \
  --master-addr "${MASTER_ADDR}" \
  --master-port "${MASTER_PORT}" \
  tools/convert_hf_to_torch_dist.py \
  "${MODEL_ARGS[@]}" \
  --hf-checkpoint "${HF_CKPT}" \
  --save "${SAVE_DIR}" \
  --megatron-to-hf-mode raw \
  --tensor-model-parallel-size "${TP}" \
  --pipeline-model-parallel-size "${PP}" \
  --expert-model-parallel-size "${EP}" \
  --expert-tensor-parallel-size 1
18 changes: 18 additions & 0 deletions scripts/convert-minimax-m2-megatron-to-hf.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
# ============================================================
# Script 3/3: Megatron -> HF Weight Conversion
# MiniMax-M2.5 (229B MoE)
#
# Runs tools/convert_torch_dist_to_hf.py. Every variable assigned with
# ${VAR:-default} below can be overridden from the environment. The
# converter path is relative, so the script has to be started from the
# directory that contains tools/.
# ============================================================
set -ex

# ---- Paths (modify according to your environment) ----
HF_CKPT=${HF_CKPT:-"/root/MiniMax-M2.5"}
MEGATRON_CKPT=${MEGATRON_CKPT:-"/root/MiniMax-M2.5_slime"}
INPUT_DIR=${INPUT_DIR:-"${MEGATRON_CKPT}/release"}
SAVE_DIR=${SAVE_DIR:-"/root/MiniMax-M2.5_hf_output"}

# Vocabulary size passed to the converter; the default is unchanged.
VOCAB_SIZE=${VOCAB_SIZE:-200064}

# Paths are quoted so directories containing spaces survive word splitting.
python tools/convert_torch_dist_to_hf.py \
  --input-dir "${INPUT_DIR}" \
  --output-dir "${SAVE_DIR}" \
  --origin-hf-dir "${HF_CKPT}" \
  --vocab-size "${VOCAB_SIZE}"
39 changes: 39 additions & 0 deletions scripts/models/minimax-m2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# MiniMax-M2.5 (229B, 62 layers, 256 experts, top-8)
# Command-line flags describing the model architecture, collected in the
# MODEL_ARGS array. The array is reset first and then filled group by group,
# so sourcing this file more than once always yields the same contents.
MODEL_ARGS=()

# Layer spec
MODEL_ARGS+=(--spec "slime_plugins.models.minimax_m2" "get_minimax_m2_layer_spec")

# Dense transformer dimensions
MODEL_ARGS+=(
  --disable-bias-linear
  --num-layers 62
  --hidden-size 3072
  --ffn-hidden-size 1536
  --num-attention-heads 48
  --kv-channels 128
  --num-query-groups 8
)

# Normalization, positional encoding, activation, embeddings
MODEL_ARGS+=(
  --normalization RMSNorm
  --position-embedding-type rope
  --norm-epsilon 1e-6
  --swiglu
  --untie-embeddings-and-output-weights
  --vocab-size 200064
  --group-query-attention
)

# Rotary embedding and attention details
MODEL_ARGS+=(
  --rotary-percent 0.5
  --rotary-base 5000000
  --qk-layernorm
  --no-rope-fusion
  --attention-softmax-in-fp32
)

# MoE: expert layout
MODEL_ARGS+=(
  --num-experts 256
  --moe-ffn-hidden-size 1536
  --moe-router-topk 8
  --moe-layer-freq "[1]*62"
)

# MoE: router behaviour
MODEL_ARGS+=(
  --moe-router-pre-softmax
  --moe-router-score-function sigmoid
  --moe-router-enable-expert-bias
  --moe-router-load-balancing-type none
  --moe-token-dispatcher-type alltoall
  --moe-router-dtype fp32
  --moe-aux-loss-coeff 0
)

# MoE: kernel options
MODEL_ARGS+=(
  --moe-grouped-gemm
  --moe-permute-fusion
)
171 changes: 171 additions & 0 deletions scripts/run-minimax-m2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
#!/bin/bash
# ============================================================
# Script 2/3: MiniMax-M2.5 (229B MoE) RL Training
# ============================================================

# Clean up leftovers of a previous run so the task can be re-run.
# These commands run before `set -e` is enabled, so a non-zero status
# (for example pkill finding no matching process) does not abort the script.
# WARNING: this force-kills every matching sglang, ray and python process.
pkill -9 sglang
sleep 3
ray stop --force
pkill -9 ray
pkill -9 python
sleep 3
pkill -9 ray
pkill -9 python

set -ex

# Python reads PYTHONUNBUFFERED (any non-empty value) to disable stdout/stderr
# buffering; the previous name PYTHONBUFFERED is not a variable Python knows.
export PYTHONUNBUFFERED=1

# Count NVLink entries (NV<n>) in the GPU topology matrix. When nvidia-smi is
# missing or fails, its stderr is discarded and the count is simply 0.
# `grep -o | wc -l` is kept deliberately: the pipeline's status is that of
# `wc`, so a zero-match grep does not trip `set -e`.
NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
if (( NVLINK_COUNT > 0 )); then
  HAS_NVLINK=1
else
  HAS_NVLINK=0
fi
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"

# Model hyper-parameters (MODEL_ARGS) live next to this script.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
source "${SCRIPT_DIR}/models/minimax-m2.sh"

# ---- Paths (modify according to your environment) ----
# BASE_DIR is the root under which checkpoints and datasets are looked up.
# Every path built from it is quoted so a BASE_DIR containing spaces or glob
# characters stays a single array element.
BASE_DIR=${BASE_DIR:-"/root"}

# Checkpoint locations and save cadence.
CKPT_ARGS=(
  --hf-checkpoint "${BASE_DIR}/MiniMax-M2.5"
  --ref-load "${BASE_DIR}/MiniMax-M2.5_torch_dist"
  --load "${BASE_DIR}/MiniMax-M2.5_slime/"
  --save "${BASE_DIR}/MiniMax-M2.5_slime/"
  --save-interval 20
  --megatron-to-hf-mode raw
  --model-name minimax_m2
)

# Rollout (sampling) settings.
ROLLOUT_ARGS=(
  --prompt-data "${BASE_DIR}/dapo-math-17k/dapo-math-17k.jsonl"
  --input-key prompt
  --label-key label
  --apply-chat-template
  --rollout-shuffle
  --rm-type deepscaler
  --num-rollout 3000
  --rollout-batch-size 128
  --n-samples-per-prompt 8
  --rollout-max-response-len 32768
  --rollout-temperature 1

  --over-sampling-batch-size 256
  --dynamic-sampling-filter-path slime.rollout.filter_hub.dynamic_sampling_filters.check_reward_nonzero_std

  --num-steps-per-rollout 4
  --balance-data
)

# Periodic evaluation settings.
EVAL_ARGS=(
  --eval-interval 20
  --eval-prompt-data aime "${BASE_DIR}/rl_data/aime-2024.jsonl"
  --n-samples-per-eval-prompt 8
  --eval-max-response-len 32768
  --eval-top-p 1
)

# ---- Parallelism Strategy ----
# 229B MoE, 256 experts -> requires many GPUs
# Typical config: TP=2, PP=2, EP=4, training side 16 GPUs (2 nodes x 8 GPUs)
# Inference side: SGLang on separate GPUs, EP=16+
# NOTE(review): the launch command at the end of this script passes
# --actor-num-nodes 16 --actor-num-gpus-per-node 8 --colocate, which does not
# match the "2 nodes" / "separate GPUs" description above - confirm which one
# is intended.
PERF_ARGS=(
  --tensor-model-parallel-size 2
  --sequence-parallel
  --pipeline-model-parallel-size 2
  --context-parallel-size 1
  --expert-model-parallel-size 4
  --expert-tensor-parallel-size 1

  --recompute-granularity full
  --recompute-method uniform
  --recompute-num-layers 1

  --use-dynamic-batch-size
  --max-tokens-per-gpu 8192
)

# GRPO algorithm settings. The KL loss is enabled with a coefficient of 0.
GRPO_ARGS=(
  --advantage-estimator grpo
  --use-kl-loss
  --kl-loss-coef 0.00
  --kl-loss-type low_var_kl
  --entropy-coef 0.00
  --eps-clip 0.2
  --eps-clip-high 0.28
)

# Optimizer settings, including CPU offload of optimizer state.
OPTIMIZER_ARGS=(
  --optimizer adam
  --lr 1e-6
  --lr-decay-style constant
  --weight-decay 0.1
  --adam-beta1 0.9
  --adam-beta2 0.98

  --optimizer-cpu-offload
  --overlap-cpu-optimizer-d2h-h2d
  --use-precision-aware-optimizer
)

# Weights & Biases logging is disabled by default; uncomment to enable.
WANDB_ARGS=(
  # --use-wandb
  # --wandb-project slime-dev
  # --wandb-group minimax-m2-rl
  # --wandb-key "${WANDB_KEY}"
)

TB_ARGS=(
  --use-tensorboard
)

# SGLang inference engine settings.
SGLANG_ARGS=(
  --rollout-num-gpus-per-engine 16
  --sglang-mem-fraction-static 0.7
  --sglang-ep-size 16
)

MISC_ARGS=(
  --attention-dropout 0.0
  --hidden-dropout 0.0
  --accumulate-allreduce-grads-in-fp32
  --attention-softmax-in-fp32
  --attention-backend flash
)

# Launch the Ray head node inside the container.
export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
# Keep traffic to the head node off any configured HTTP proxy.
export no_proxy="127.0.0.1,${MASTER_ADDR}"
ray start --head --node-ip-address "${MASTER_ADDR}" --num-gpus 8 \
  --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265

# Environment variables handed to the submitted job. NCCL_NVLS_ENABLE takes
# the HAS_NVLINK value computed earlier in this script.
RUNTIME_ENV_JSON="{
  \"env_vars\": {
    \"no_proxy\": \"localhost,127.0.0.1,0.0.0.0,${MASTER_ADDR}\",
    \"MASTER_ADDR\": \"${MASTER_ADDR}\",
    \"PYTHONPATH\": \"/root/Megatron-LM/\",
    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
    \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\"
  }
}"

# Submit the training job through the dashboard started above.
# Every array is expanded as "${ARR[@]}" so each element stays one argument:
# unquoted, values such as "[1]*62" in MODEL_ARGS are subject to globbing and
# paths containing spaces are split. A quoted empty array (WANDB_ARGS by
# default) expands to no arguments at all.
ray job submit --address="http://127.0.0.1:8265" \
  --runtime-env-json="${RUNTIME_ENV_JSON}" \
  -- python3 train.py \
  --actor-num-nodes 16 \
  --actor-num-gpus-per-node 8 \
  --colocate \
  "${MODEL_ARGS[@]}" \
  "${CKPT_ARGS[@]}" \
  "${ROLLOUT_ARGS[@]}" \
  "${OPTIMIZER_ARGS[@]}" \
  "${GRPO_ARGS[@]}" \
  "${WANDB_ARGS[@]}" \
  "${TB_ARGS[@]}" \
  "${PERF_ARGS[@]}" \
  "${EVAL_ARGS[@]}" \
  "${SGLANG_ARGS[@]}" \
  "${MISC_ARGS[@]}"
5 changes: 4 additions & 1 deletion slime/backends/megatron_utils/megatron_to_hf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .gpt_oss import convert_gpt_oss_to_hf
from .llama import convert_llama_to_hf
from .mimo import convert_mimo_to_hf
from .minimax_m2 import convert_minimax_m2_to_hf
from .processors import quantize_params, remove_padding
from .qwen2 import convert_qwen2_to_hf
from .qwen3_5 import convert_qwen3_5_to_hf
Expand Down Expand Up @@ -34,7 +35,9 @@ def convert_to_hf(args, model_name, name, param, quantization_config=None):

# TODO optimize code details
def _convert_to_hf_core(args, model_name, name, param):
if "glm4moelite" in model_name or "deepseekv3" in model_name:
if "minimaxm2" in model_name or "minimax_m2" in model_name:
converted_named_tensors = convert_minimax_m2_to_hf(args, name, param)
elif "glm4moelite" in model_name or "deepseekv3" in model_name:
converted_named_tensors = convert_deepseekv3_to_hf(args, name, param)
elif "glm4moe" in model_name:
converted_named_tensors = convert_glm4moe_to_hf(args, name, param)
Expand Down
88 changes: 88 additions & 0 deletions slime/backends/megatron_utils/megatron_to_hf/minimax_m2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import re

import torch


def convert_minimax_m2_to_hf(args, name, param):
    """Convert one Megatron parameter into its HuggingFace counterpart(s) for MiniMax-M2.5.

    HF uses `block_sparse_moe` prefix with expert naming w1(gate)/w2(down)/w3(up).
    Custom SelfAttention uses `q_norm`/`k_norm` (not `q_layernorm`/`k_layernorm`).

    Args:
        args: Namespace-like object. It is only read when converting the fused
            QKV weight, where `hidden_size`, `num_attention_heads`,
            `num_query_groups` and the optional `kv_channels` are used.
        name: Megatron parameter name, e.g.
            `module.module.decoder.layers.<i>.mlp.router.weight`.
        param: Tensor holding the parameter value.

    Returns:
        A list of `(hf_name, tensor)` tuples. Most parameters map to a single
        entry; the fused QKV weight yields three and an expert `linear_fc1`
        weight yields two.

    Raises:
        ValueError: If `name` is not a recognised MiniMax-M2.5 parameter.
    """
    # Direct mappings
    if name == "module.module.embedding.word_embeddings.weight":
        return [("model.embed_tokens.weight", param)]
    if name == "module.module.output_layer.weight":
        return [("lm_head.weight", param)]
    if name == "module.module.decoder.final_layernorm.weight":
        return [("model.norm.weight", param)]

    layer_match = re.match(r"module\.module\.decoder\.layers\.(\d+)\.(.+)", name)
    if layer_match is None:
        raise ValueError(f"Unknown parameter name: {name}")
    layer_idx, rest = layer_match.groups()
    layer_prefix = f"model.layers.{layer_idx}"

    # MoE experts: linear_fc1 -> w1 (gate) + w3 (up), linear_fc2 -> w2 (down).
    # Every dot is escaped so that only a literal "mlp.experts." prefix matches.
    expert_match = re.match(r"mlp\.experts\.(.+)\.weight(\d+)", rest)
    if expert_match:
        projection, expert_idx = expert_match.groups()
        expert_prefix = f"{layer_prefix}.block_sparse_moe.experts.{expert_idx}"
        if projection == "linear_fc1":
            # The fused tensor is split into two equal halves along dim 0:
            # the first half is exported as w1 (gate), the second as w3 (up).
            gate_weight, up_weight = param.chunk(2, dim=0)
            return [
                (f"{expert_prefix}.w1.weight", gate_weight),
                (f"{expert_prefix}.w3.weight", up_weight),
            ]
        if projection == "linear_fc2":
            return [(f"{expert_prefix}.w2.weight", param)]
        raise ValueError(f"Unknown expert parameter name: {name}")

    # Attention: fused QKV -> split into Q/K/V (GQA: 48 heads, 8 kv heads)
    if rest == "self_attention.linear_qkv.weight":
        # The GQA dimensions are only needed here, so `args` is not touched
        # for any other parameter.
        kv_channels = getattr(args, "kv_channels", None)
        head_dim = kv_channels if kv_channels is not None else args.hidden_size // args.num_attention_heads
        value_num_per_group = args.num_attention_heads // args.num_query_groups

        # View as (group, slot, head_dim, hidden); within each group the first
        # `value_num_per_group` slots are taken as Q, then one slot each as K and V.
        grouped = param.view(args.num_query_groups, -1, head_dim, args.hidden_size)
        q_param, k_param, v_param = torch.split(
            grouped, split_size_or_sections=[value_num_per_group, 1, 1], dim=1
        )
        return [
            (f"{layer_prefix}.self_attn.q_proj.weight", q_param.reshape(-1, args.hidden_size)),
            (f"{layer_prefix}.self_attn.k_proj.weight", k_param.reshape(-1, args.hidden_size)),
            (f"{layer_prefix}.self_attn.v_proj.weight", v_param.reshape(-1, args.hidden_size)),
        ]

    # Parameters that are renamed without touching the tensor.
    one_to_one = {
        # Attention: o_proj
        "self_attention.linear_proj.weight": "self_attn.o_proj.weight",
        # Input layernorm
        "self_attention.linear_qkv.layer_norm_weight": "input_layernorm.weight",
        # QK Norm (custom attention uses q_norm/k_norm, NOT q_layernorm/k_layernorm)
        "self_attention.q_norm.weight": "self_attn.q_norm.weight",
        "self_attention.k_norm.weight": "self_attn.k_norm.weight",
        # Post-attention layernorm
        "pre_mlp_layernorm.weight": "post_attention_layernorm.weight",
        # Router
        "mlp.router.weight": "block_sparse_moe.gate.weight",
        "mlp.router.expert_bias": "block_sparse_moe.e_score_correction_bias",
    }
    hf_suffix = one_to_one.get(rest)
    if hf_suffix is not None:
        return [(f"{layer_prefix}.{hf_suffix}", param)]

    raise ValueError(f"Unknown parameter name: {name}")
Loading
Loading