diff --git a/modelopt_recipes/models/Kimi-K2.5/dflash.yaml b/modelopt_recipes/models/Kimi-K2.5/dflash.yaml
new file mode 100644
index 00000000000..e37e7539b77
--- /dev/null
+++ b/modelopt_recipes/models/Kimi-K2.5/dflash.yaml
@@ -0,0 +1,67 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Per-model DFlash offline training recipe for Kimi-K2.5.
+
+metadata:
+  recipe_type: speculative_dflash
+  description: DFlash offline training recipe for Kimi-K2.5.
+
+# maps to ModelArguments (main.py)
+model:
+  model_name_or_path: moonshotai/Kimi-K2.5
+  trust_remote_code: true
+  use_fake_base_for_offline: true
+
+# maps to DataArguments (main.py)
+data:
+  data_path:
+  offline_data_path: <path to offline data>
+  # Jinja chat template with {% generation %} tags for answer_only_loss.
+  # Required when answer_only_loss=true. Set in per-model launcher YAML.
+  # Templates are in modelopt_recipes/general/speculative_decoding/chat_templates/
+  chat_template:
+
+# maps to TrainingArguments (main.py)
+training:
+  # --- commonly modified ---
+  output_dir: ckpts/kimi-k25-dflash
+  num_train_epochs: 10
+  per_device_train_batch_size: 1
+  learning_rate: 6.0e-4
+  warmup_steps: 100
+  training_seq_len: 4096
+  logging_steps: 100
+  save_steps: 5000
+  cp_size: 1
+  dp_shard_size: 1
+  disable_tqdm: true
+  estimate_ar: false
+  ar_validate_steps: 0
+  answer_only_loss: true
+
+  # --- rarely modified ---
+  do_eval: false
+  lr_scheduler_type: linear
+  save_strategy: steps
+  weight_decay: 0.0
+  dataloader_drop_last: true
+  bf16: true
+  tf32: true
+  remove_unused_columns: false
+  ddp_find_unused_parameters: true
+  ddp_timeout: 1800
+  report_to: tensorboard
+
+# maps to DFlashConfig (modelopt/torch/speculative/config.py).
+# dflash_mask_token_id falls back to tokenizer.mask_token_id when unset; set
+# explicitly here if the tokenizer does not provide one.
+dflash:
+  dflash_block_size: 8
+  dflash_num_anchors: 512
+  dflash_use_torch_compile: false
+  dflash_self_logit_distillation: true
+  dflash_loss_decay_factor: 4.0
+  dflash_architecture_config:
+    num_hidden_layers: 5
+    # sliding_window and layer_types are inherited from base model config automatically
diff --git a/modelopt_recipes/models/Kimi-K2.5/eagle3.yaml b/modelopt_recipes/models/Kimi-K2.5/eagle3.yaml
new file mode 100644
index 00000000000..64fb046cdc5
--- /dev/null
+++ b/modelopt_recipes/models/Kimi-K2.5/eagle3.yaml
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Per-model EAGLE3 offline training recipe for Kimi-K2.5.
+# Mirrors examples/speculative_decoding/scripts/train_kimi_k25_offline.sh.
+
+metadata:
+  recipe_type: speculative_eagle
+  description: EAGLE3 offline training recipe for Kimi-K2.5.
+
+# maps to ModelArguments (main.py)
+model:
+  model_name_or_path: moonshotai/Kimi-K2.5
+  trust_remote_code: true
+  use_fake_base_for_offline: true
+
+# maps to DataArguments (main.py)
+data:
+  data_path: input_conversations/train.jsonl
+  offline_data_path: <path to offline data>
+  draft_vocab_cache:
+  vlm_img_dir:
+  vlm_processor:
+
+# maps to TrainingArguments (main.py)
+training:
+  # --- commonly modified ---
+  output_dir: ckpts/kimi-k25-eagle3
+  num_train_epochs: 1
+  per_device_train_batch_size: 1
+  learning_rate: 1.0e-4
+  warmup_steps: 1000
+  training_seq_len: 4096
+  logging_steps: 100
+  save_steps: 8192
+  cp_size: 1
+  disable_tqdm: false
+  estimate_ar: false
+  ar_validate_steps: -1
+  answer_only_loss: false
+
+  # --- rarely modified ---
+  do_eval: false
+  lr_scheduler_type: linear
+  save_strategy: steps
+  weight_decay: 0.0
+  dataloader_drop_last: true
+  bf16: true
+  tf32: true
+  remove_unused_columns: false
+
+# maps to EagleConfig (modelopt/torch/speculative/config.py).
+eagle:
+  # eagle_offline is derived from data.offline_data_path; do not set here.
+  eagle_decoder_type: kimik2
+  eagle_ttt_steps: 3
+  eagle_mix_hidden_states: false
+  eagle_use_torch_compile: true
+  eagle_self_logit_distillation: true
+  eagle_freeze_base_model: true
+  eagle_loss_decay_factor: 0.9
+  eagle_hidden_state_distillation: false
+  eagle_reuse_base_decoder: false
+  eagle_report_acc: true
+  eagle_enable_nvtx: false
+  # Rope scaling: disable during training (default_config.py uses rope_type=default),
+  # inject YaRN during export for long-context inference.
+  eagle_export_rope_scaling:
+    rope_type: yarn
+    factor: 32.0
+    original_max_position_embeddings: 2048
+  # overwrite to modelopt/torch/speculative/eagle/default_config.py
+  eagle_architecture_config: {}
diff --git a/tools/launcher/examples/Qwen/Qwen3-0.6B/chat_template_train.jinja b/modelopt_recipes/models/Qwen3-0.6B/chat_template_train.jinja
similarity index 100%
rename from tools/launcher/examples/Qwen/Qwen3-0.6B/chat_template_train.jinja
rename to modelopt_recipes/models/Qwen3-0.6B/chat_template_train.jinja
diff --git a/modelopt_recipes/models/Qwen3-0.6B/dflash.yaml b/modelopt_recipes/models/Qwen3-0.6B/dflash.yaml
new file mode 100644
index 00000000000..8665fa16d6d
--- /dev/null
+++ b/modelopt_recipes/models/Qwen3-0.6B/dflash.yaml
@@ -0,0 +1,32 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Per-model DFlash training recipe for Qwen3-0.6B.
+
+metadata:
+  recipe_type: speculative_dflash
+  description: DFlash training recipe for Qwen3-0.6B.
+
+# maps to ModelArguments (main.py)
+model:
+  model_name_or_path: Qwen/Qwen3-0.6B
+  trust_remote_code: false
+  use_fake_base_for_offline: false
+
+# maps to DataArguments (main.py)
+data:
+  data_path:
+  offline_data_path:
+
+# maps to TrainingArguments (main.py)
+training:
+  output_dir:
+  training_seq_len: 512
+  answer_only_loss: true
+
+# maps to DFlashConfig (modelopt/torch/speculative/config.py).
+dflash:
+  dflash_block_size: 8
+  dflash_mask_token_id: 151669  # Qwen3 tokenizer mask token
+  dflash_architecture_config:
+    num_hidden_layers: 2  # small draft for 0.6B base
diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/chat_template_train.jinja b/modelopt_recipes/models/Qwen3-8B/chat_template_train.jinja
similarity index 100%
rename from tools/launcher/examples/Qwen/Qwen3-8B/chat_template_train.jinja
rename to modelopt_recipes/models/Qwen3-8B/chat_template_train.jinja
diff --git a/modelopt_recipes/models/Qwen3-8B/dflash.yaml b/modelopt_recipes/models/Qwen3-8B/dflash.yaml
new file mode 100644
index 00000000000..ba15a346ab3
--- /dev/null
+++ b/modelopt_recipes/models/Qwen3-8B/dflash.yaml
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Per-model DFlash training recipe for Qwen3-8B.
+
+metadata:
+  recipe_type: speculative_dflash
+  description: DFlash training recipe for Qwen3-8B.
+
+# maps to ModelArguments (main.py)
+model:
+  model_name_or_path: Qwen/Qwen3-8B
+  trust_remote_code: false
+  use_fake_base_for_offline: false
+
+# maps to DataArguments (main.py)
+data:
+  data_path:
+  offline_data_path:
+
+# maps to TrainingArguments (main.py)
+training:
+  output_dir:
+  training_seq_len: 4096
+
+# maps to DFlashConfig (modelopt/torch/speculative/config.py).
+dflash:
+  dflash_block_size: 16
+  dflash_loss_decay_factor: 7.0  # paper Eq.4: gamma=7 pairs with block_size=16
+  dflash_mask_token_id: 151669   # Qwen3 tokenizer mask token
diff --git a/modelopt_recipes/models/Qwen3-8B/eagle3.yaml b/modelopt_recipes/models/Qwen3-8B/eagle3.yaml
new file mode 100644
index 00000000000..91de6c13594
--- /dev/null
+++ b/modelopt_recipes/models/Qwen3-8B/eagle3.yaml
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Per-model EAGLE3 training recipe for Qwen3-8B.
+# Used by both online and offline EAGLE3 pipelines; the launcher YAML supplies
+# data.data_path (online) or data.offline_data_path (offline).
+
+metadata:
+  recipe_type: speculative_eagle
+  description: EAGLE3 training recipe for Qwen3-8B.
+
+# maps to ModelArguments (main.py)
+model:
+  model_name_or_path: Qwen/Qwen3-8B
+  trust_remote_code: false
+  use_fake_base_for_offline: false
+
+# maps to DataArguments (main.py)
+data:
+  data_path:
+  offline_data_path:
+
+# maps to TrainingArguments (main.py)
+training:
+  output_dir:
+  training_seq_len: 4096
+
+# maps to EagleConfig (modelopt/torch/speculative/config.py).
+# Qwen3 uses the llama-family decoder, which is the EagleConfig default.
+eagle: {}
diff --git a/tools/launcher/examples/Qwen/Qwen3.5-4B/chat_template_train.jinja b/modelopt_recipes/models/Qwen3.5-4B/chat_template_train.jinja
similarity index 100%
rename from tools/launcher/examples/Qwen/Qwen3.5-4B/chat_template_train.jinja
rename to modelopt_recipes/models/Qwen3.5-4B/chat_template_train.jinja
diff --git a/modelopt_recipes/models/Qwen3.5-4B/dflash.yaml b/modelopt_recipes/models/Qwen3.5-4B/dflash.yaml
new file mode 100644
index 00000000000..849a6757bc3
--- /dev/null
+++ b/modelopt_recipes/models/Qwen3.5-4B/dflash.yaml
@@ -0,0 +1,38 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Per-model DFlash training recipe for Qwen3.5-4B.
+#
+# NOTE: Qwen3.5-4B has non-standard head_dim=160. The draft model overrides the
+# attention architecture (32 heads, head_dim=128) for vLLM KV cache compatibility.
+
+metadata:
+  recipe_type: speculative_dflash
+  description: DFlash training recipe for Qwen3.5-4B (head_dim workaround for vLLM).
+
+# maps to ModelArguments (main.py)
+model:
+  model_name_or_path: Qwen/Qwen3.5-4B
+  trust_remote_code: false
+  use_fake_base_for_offline: false
+
+# maps to DataArguments (main.py)
+data:
+  data_path:
+  offline_data_path:
+
+# maps to TrainingArguments (main.py)
+training:
+  output_dir:
+  training_seq_len: 4096
+
+# maps to DFlashConfig (modelopt/torch/speculative/config.py).
+dflash:
+  dflash_mask_token_id: 248070  # Qwen3.5 tokenizer mask token (different from Qwen3)
+  dflash_architecture_config:
+    # Override base head_dim=160 to head_dim=128 for vLLM KV cache compatibility.
+    num_attention_heads: 32
+    num_key_value_heads: 8
+    head_dim: 128
+    intermediate_size: 9728
+    rope_theta: 10000000
diff --git a/tests/regression/torch/speculative/test_dflash.py b/tests/regression/torch/speculative/test_dflash.py
index 2e8092d62f5..7b7938bb9a4 100644
--- a/tests/regression/torch/speculative/test_dflash.py
+++ b/tests/regression/torch/speculative/test_dflash.py
@@ -31,34 +31,23 @@
 import pytest
 from _test_utils.examples.run_command import MODELOPT_ROOT, run_example_command
 
-DFLASH_YAML = str(
-    MODELOPT_ROOT / "modelopt_recipes" / "general" / "speculative_decoding" / "dflash.yaml"
-)
+DFLASH_YAML = str(MODELOPT_ROOT / "modelopt_recipes" / "models" / "Qwen3-0.6B" / "dflash.yaml")
 
 CHAT_TEMPLATE = str(
-    MODELOPT_ROOT
-    / "tools"
-    / "launcher"
-    / "examples"
-    / "Qwen"
-    / "Qwen3-0.6B"
-    / "chat_template_train.jinja"
+    MODELOPT_ROOT / "modelopt_recipes" / "models" / "Qwen3-0.6B" / "chat_template_train.jinja"
 )
 
 SYNTH_DATA = str(MODELOPT_ROOT / "examples" / "dataset" / "synthetic_conversations_1k.jsonl")
 
-# Match tools/launcher/examples/Qwen/Qwen3-0.6B/hf_online_dflash.yaml
+# Match tools/launcher/examples/Qwen/Qwen3-0.6B/hf_online_dflash.yaml. Model-specific
+# settings (block_size, mask_token_id, training_seq_len, answer_only_loss, draft
+# num_hidden_layers) live in the per-model recipe at DFLASH_YAML.
 _DFLASH_OVERRIDES = [
     f"data.data_path={SYNTH_DATA}",
     f"data.chat_template={CHAT_TEMPLATE}",
-    "training.training_seq_len=512",
     "training.per_device_train_batch_size=2",
     "training.logging_steps=100",
-    "training.answer_only_loss=true",
-    "dflash.dflash_block_size=8",
-    "dflash.dflash_mask_token_id=151669",
     "dflash.dflash_use_torch_compile=False",
-    "dflash.dflash_architecture_config.num_hidden_layers=2",
 ]
 
 
diff --git a/tests/regression/torch/speculative/test_dflash_offline.py b/tests/regression/torch/speculative/test_dflash_offline.py
index da951fdcda6..8311b73bf18 100644
--- a/tests/regression/torch/speculative/test_dflash_offline.py
+++ b/tests/regression/torch/speculative/test_dflash_offline.py
@@ -34,33 +34,23 @@
 import pytest
 from _test_utils.examples.run_command import MODELOPT_ROOT, run_example_command
 
-DFLASH_YAML = str(
-    MODELOPT_ROOT / "modelopt_recipes" / "general" / "speculative_decoding" / "dflash.yaml"
-)
+DFLASH_YAML = str(MODELOPT_ROOT / "modelopt_recipes" / "models" / "Qwen3-0.6B" / "dflash.yaml")
 
 CHAT_TEMPLATE = str(
-    MODELOPT_ROOT
-    / "tools"
-    / "launcher"
-    / "examples"
-    / "Qwen"
-    / "Qwen3-0.6B"
-    / "chat_template_train.jinja"
+    MODELOPT_ROOT / "modelopt_recipes" / "models" / "Qwen3-0.6B" / "chat_template_train.jinja"
 )
 
 SYNTH_DATA = str(MODELOPT_ROOT / "examples" / "dataset" / "synthetic_conversations_1k.jsonl")
 
 # Match _DFLASH_OVERRIDES in test_dflash.py so the offline run is comparable to online.
+# Model-specific settings live in DFLASH_YAML; only env-/run-specific knobs go here.
+# logging_steps is overridden lower than the online test so the shorter offline run
+# still produces multiple log entries.
 _DFLASH_OVERRIDES = [
     f"data.chat_template={CHAT_TEMPLATE}",
-    "training.training_seq_len=512",
     "training.per_device_train_batch_size=2",
     "training.logging_steps=50",
-    "training.answer_only_loss=true",
-    "dflash.dflash_block_size=8",
-    "dflash.dflash_mask_token_id=151669",
     "dflash.dflash_use_torch_compile=False",
-    "dflash.dflash_architecture_config.num_hidden_layers=2",
 ]
 
 # Number of conversations to dump. Smaller than the full 1K to keep dump time
diff --git a/tools/launcher/examples/Qwen/Qwen3-0.6B/hf_online_dflash.yaml b/tools/launcher/examples/Qwen/Qwen3-0.6B/hf_online_dflash.yaml
index c5534cf9ffa..d30731c6a23 100644
--- a/tools/launcher/examples/Qwen/Qwen3-0.6B/hf_online_dflash.yaml
+++ b/tools/launcher/examples/Qwen/Qwen3-0.6B/hf_online_dflash.yaml
@@ -29,22 +29,17 @@ pipeline:
   task_0:
     script: common/specdec/dflash_online_training.sh
     args:
-      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml
+      - --config modules/Model-Optimizer/modelopt_recipes/models/Qwen3-0.6B/dflash.yaml
       - model.model_name_or_path=<<global_vars.hf_model>>
       - data.data_path=modules/Model-Optimizer/examples/dataset/synthetic_conversations_1k.jsonl
-      - data.chat_template=examples/Qwen/Qwen3-0.6B/chat_template_train.jinja
+      - data.chat_template=modules/Model-Optimizer/modelopt_recipes/models/Qwen3-0.6B/chat_template_train.jinja
       - training.output_dir=/scratchspace/dflash_qwen3_0.6b
       - training.num_train_epochs=3
-      - training.training_seq_len=512
       - training.per_device_train_batch_size=2
       - training.save_steps=500
       - training.logging_steps=100
       - training.disable_tqdm=true
-      - training.answer_only_loss=true
-      - dflash.dflash_block_size=8
-      - dflash.dflash_mask_token_id=151669
       - dflash.dflash_use_torch_compile=False
-      - dflash.dflash_architecture_config.num_hidden_layers=2
     environment:
       - MAX_FINAL_LOSS: "2.0"
       - MIN_FINAL_ACC: "0.40"
diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_offline_eagle3.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_offline_eagle3.yaml
index 24068c4bb3e..115c7a24a69 100644
--- a/tools/launcher/examples/Qwen/Qwen3-8B/hf_offline_eagle3.yaml
+++ b/tools/launcher/examples/Qwen/Qwen3-8B/hf_offline_eagle3.yaml
@@ -67,11 +67,10 @@ pipeline:
   task_2:
     script: common/eagle3/train_eagle.sh
     args:
-      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - --config modules/Model-Optimizer/modelopt_recipes/models/Qwen3-8B/eagle3.yaml
       - model.model_name_or_path=<<global_vars.hf_model>>
       - data.offline_data_path=/scratchspace/offline_hidden_states
       - training.output_dir=/scratchspace/eagle3
-      - training.training_seq_len=4096
       - training.disable_tqdm=true
       - training.ar_validate_steps=500000
     slurm_config:
diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_online_dflash.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_online_dflash.yaml
index 7c7f2a959dc..7105f3a7506 100644
--- a/tools/launcher/examples/Qwen/Qwen3-8B/hf_online_dflash.yaml
+++ b/tools/launcher/examples/Qwen/Qwen3-8B/hf_online_dflash.yaml
@@ -35,23 +35,12 @@ pipeline:
   task_0:
     script: common/specdec/dflash_online_training.sh
     args:
-      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml
+      - --config modules/Model-Optimizer/modelopt_recipes/models/Qwen3-8B/dflash.yaml
       - model.model_name_or_path=<<global_vars.hf_model>>
       - data.data_path=/hf-local/modelopt/Speculative-Decoding-Dataset-v1-Qwen3-8B/sample-100K-openai.jsonl
-      - data.chat_template=examples/Qwen/Qwen3-8B/chat_template_train.jinja
+      - data.chat_template=modules/Model-Optimizer/modelopt_recipes/models/Qwen3-8B/chat_template_train.jinja
       - training.output_dir=/scratchspace/dflash_bs16
-      - training.per_device_train_batch_size=1
       - training.num_train_epochs=1
-      - training.training_seq_len=4096
-      - training.save_steps=5000
-      - training.logging_steps=100
-      - training.disable_tqdm=true
-      - training.answer_only_loss=true
-      - dflash.dflash_block_size=16
-      - dflash.dflash_num_anchors=512
-      - dflash.dflash_loss_decay_factor=7
-      - dflash.dflash_mask_token_id=151669
-      - dflash.dflash_architecture_config.num_hidden_layers=5
     environment:
       - MAX_FINAL_LOSS: "5.0"
       - MIN_FINAL_ACC: "0.15"
diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_online_eagle3.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_online_eagle3.yaml
index 969a865f35f..723ad8f71a1 100644
--- a/tools/launcher/examples/Qwen/Qwen3-8B/hf_online_eagle3.yaml
+++ b/tools/launcher/examples/Qwen/Qwen3-8B/hf_online_eagle3.yaml
@@ -36,11 +36,10 @@ pipeline:
   task_1:
     script: common/eagle3/train_eagle.sh
     args:
-      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - --config modules/Model-Optimizer/modelopt_recipes/models/Qwen3-8B/eagle3.yaml
       - model.model_name_or_path=<<global_vars.hf_model>>
       - data.data_path=/scratchspace/data/train.jsonl
       - training.output_dir=/scratchspace/eagle3
-      - training.training_seq_len=4096
       - training.disable_tqdm=true
       - training.ar_validate_steps=500000
       - training.num_train_epochs=1
diff --git a/tools/launcher/examples/Qwen/Qwen3.5-4B/hf_online_dflash.yaml b/tools/launcher/examples/Qwen/Qwen3.5-4B/hf_online_dflash.yaml
index 5d7e739157b..33e06dfded2 100644
--- a/tools/launcher/examples/Qwen/Qwen3.5-4B/hf_online_dflash.yaml
+++ b/tools/launcher/examples/Qwen/Qwen3.5-4B/hf_online_dflash.yaml
@@ -5,9 +5,6 @@
 #   task_1: vLLM smoke test with DFlash speculative decoding
 #   task_2: HF AR evaluation (1 GPU)
 #
-# NOTE: Qwen3.5-4B has non-standard head_dim=160. The draft model uses z-lab's
-# architecture (32 heads, head_dim=128) for vLLM KV cache compatibility.
-#
 # Reference: "DFlash: Block Diffusion for Flash Speculative Decoding" (arXiv:2602.06036)
 #
 # Usage:
@@ -23,28 +20,13 @@ pipeline:
   task_0:
     script: common/specdec/dflash_online_training.sh
     args:
-      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml
+      - --config modules/Model-Optimizer/modelopt_recipes/models/Qwen3.5-4B/dflash.yaml
       - model.model_name_or_path=<<global_vars.hf_model>>
-      - model.trust_remote_code=false
       - data.data_path=/hf-local/modelopt/Speculative-Decoding-Dataset-v1-Qwen3-8B/sample-1K-openai.jsonl
-      - data.chat_template=examples/Qwen/Qwen3.5-4B/chat_template_train.jinja
+      - data.chat_template=modules/Model-Optimizer/modelopt_recipes/models/Qwen3.5-4B/chat_template_train.jinja
       - training.output_dir=/scratchspace/dflash_qwen3.5_4b
       - training.num_train_epochs=1
-      - training.training_seq_len=4096
-      - training.save_steps=5000
       - training.logging_steps=1000
-      - training.disable_tqdm=true
-      - training.answer_only_loss=true
-      - dflash.dflash_block_size=8
-      - dflash.dflash_num_anchors=512
-      - dflash.dflash_loss_decay_factor=4
-      - dflash.dflash_mask_token_id=248070
-      - dflash.dflash_architecture_config.num_hidden_layers=5
-      - dflash.dflash_architecture_config.num_attention_heads=32
-      - dflash.dflash_architecture_config.num_key_value_heads=8
-      - dflash.dflash_architecture_config.head_dim=128
-      - dflash.dflash_architecture_config.intermediate_size=9728
-      - dflash.dflash_architecture_config.rope_theta=10000000
     environment:
     slurm_config:
       _factory_: "slurm_factory"