# Source: NVIDIA/Model-Optimizer, tools/launcher/examples/Qwen/Qwen3-8B/hf_offline_eagle3.yaml @ 672473f98e387a6aee92573e71ea62871c121b61
# EAGLE3 offline speculative decoding pipeline for Qwen3-8B.
#
# 4-step pipeline:
#   task_0: Data synthesis — query a TRT-LLM server with the prompt samples and save the synthesized data
#   task_1: Dump hidden states — run target model to capture hidden states
#   task_2: Offline training — train the EAGLE3 draft head
#   task_3: Benchmark — evaluate speculative decoding speedup via vLLM
#
# All tasks share /scratchspace to pass artifacts between steps.
#
# Usage:
#   uv run launch.py --yaml examples/Qwen/Qwen3-8B/hf_offline_eagle3.yaml --yes
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/Qwen3-8B/hf_offline_eagle3.yaml --yes

# Job name for this pipeline.
job_name: 'Qwen3-8B_EAGLE3_offline'
pipeline:
  # Pipeline-level flags; the note is left unset (null).
  allow_to_fail: false
  skip: false
  note: null

  # Shared values; referenced by the tasks as <<global_vars.hf_model>>.
  global_vars:
    hf_model: '/hf-local/Qwen/Qwen3-8B'

  # Step 1: Data synthesis via TRT-LLM server.
  # The argument list is split by the bare "--" entry: everything before it
  # goes to trtllm-serve, everything after it goes to tools/query.py.
  task_0:
    script: 'common/tensorrt_llm/query.sh'
    args:
      # --- trtllm-serve options ---
      - '--model <<global_vars.hf_model>>'
      - '--tp_size 8'
      # NOTE(review): nothing in this file marks the model as MoE, so the
      # expert-parallel size is presumably a no-op here; confirm.
      - '--ep_size 8'
      - '--max_num_tokens 32000'
      - '--port 8000'
      - '--host 0.0.0.0'
      - '--trust_remote_code'
      # --- separator between the two argument groups ---
      - '--'
      # --- tools/query.py options: prompt dataset in, saved data out ---
      - '--data /hf-local/modelopt/Speculative-Decoding-Prompt-Samples'
      - '--save /scratchspace/data'
    environment:
      - HF_LOCAL: '/hf-local'
    slurm_config:
      _factory_: 'slurm_factory'
      nodes: 1
      ntasks_per_node: 8
      gpus_per_node: 8
      container: 'nvcr.io/nvidia/tensorrt-llm/release:1.2.0'

  # Step 2: Dump hidden states from target model.
  # Input is /scratchspace/data (the path task_0 passes to --save); output is
  # /scratchspace/offline_hidden_states (the path task_2 passes as
  # data.offline_data_path).
  task_1:
    script: 'common/eagle3/dump_offline_data.sh'
    args:
      - '--input-data /scratchspace/data'
      - '--output-dir /scratchspace/offline_hidden_states'
      - '--max-seq-len 8192'
      - '--tp 8'
      - '--moe-ep 8'
    environment:
      # Target checkpoint; same global variable task_0 passes to --model.
      - HF_MODEL_CKPT: '<<global_vars.hf_model>>'
    slurm_config:
      _factory_: 'slurm_factory'
      nodes: 1
      ntasks_per_node: 8
      gpus_per_node: 8
      container: 'nvcr.io/nvidia/tensorrt-llm/release:1.2.0'

  # Step 3: Train EAGLE3 draft head (offline, single task).
  # Consumes the hidden states written by task_1 and writes to
  # /scratchspace/eagle3.
  task_2:
    script: 'common/eagle3/train_eagle.sh'
    args:
      - '--config modules/Model-Optimizer/modelopt_recipes/models/Qwen3-8B/eagle3.yaml'
      # The key=value entries below are presumably overrides applied on top of
      # the recipe given by --config; confirm against train_eagle.sh.
      - 'model.model_name_or_path=<<global_vars.hf_model>>'
      - 'data.offline_data_path=/scratchspace/offline_hidden_states'
      - 'training.output_dir=/scratchspace/eagle3'
      - 'training.disable_tqdm=true'
      # NOTE(review): such a large interval presumably keeps AR validation
      # from running during training; confirm.
      - 'training.ar_validate_steps=500000'
    slurm_config:
      _factory_: 'slurm_factory'
      nodes: 1
      # One task on the node, with all 8 GPUs requested.
      ntasks_per_node: 1
      gpus_per_node: 8
      container: 'nvcr.io/nvidia/tensorrt-llm/release:1.2.0'

  # Step 4: Benchmark speculative decoding (VLLM backend).
  task_3:
    script: 'common/specdec_bench/quick_check.sh'
    args:
      # NOTE(review): no argument in this file writes /scratchspace/export
      # (task_2 sets training.output_dir=/scratchspace/eagle3); presumably
      # train_eagle.sh exports the draft model there. Confirm.
      - '--draft_model_dir /scratchspace/export'
      - '--draft_length 3'
      - '--output_length 4096'
      - '--engine VLLM'
      - '--tp_size 8'
      - '--ep_size 1'
      - '--speculative_algorithm EAGLE3'
      - '--mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl'
      - '--concurrency 1'
    environment:
      # Target checkpoint; same global variable the earlier tasks use.
      - HF_MODEL_CKPT: '<<global_vars.hf_model>>'
    slurm_config:
      _factory_: 'slurm_factory'
      nodes: 1
      ntasks_per_node: 1
      gpus_per_node: 8
      # NOTE(review): "latest" is a floating tag, unlike the pinned 1.2.0 image
      # used by task_0..task_2; consider pinning for reproducible benchmarks.
      container: 'vllm/vllm-openai:latest'