-
Notifications
You must be signed in to change notification settings - Fork 413
Expand file tree
/
Copy pathhf_offline_eagle3.yaml
More file actions
103 lines (97 loc) · 3.21 KB
/
hf_offline_eagle3.yaml
File metadata and controls
103 lines (97 loc) · 3.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# EAGLE3 offline speculative decoding pipeline for Qwen3-8B.
#
# 4-step pipeline:
#   task_0: Data synthesis    - query a TRT-LLM server to generate prompt samples
#   task_1: Dump hidden states - run the target model to capture hidden states
#   task_2: Offline training  - train the EAGLE3 draft head
#   task_3: Benchmark         - evaluate speculative decoding speedup via vLLM
#
# All tasks share /scratchspace to pass artifacts between steps.
#
# Usage (either command):
#   uv run launch.py --yaml examples/Qwen/Qwen3-8B/hf_offline_eagle3.yaml --yes
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/Qwen3-8B/hf_offline_eagle3.yaml --yes
job_name: Qwen3-8B_EAGLE3_offline

# NOTE(review): nesting below (settings, global_vars and task_N all under
# `pipeline`) is reconstructed from key order -- confirm against the launcher's
# expected schema.
pipeline:
  # Pipeline-level switches.
  allow_to_fail: false
  skip: false
  note: null

  # Values referenced elsewhere in this file as <<global_vars.NAME>>.
  global_vars:
    hf_model: /hf-local/Qwen/Qwen3-8B

  # Step 1: Data synthesis via TRT-LLM server.
  # Args before "--" go to trtllm-serve; args after "--" go to tools/query.py.
  task_0:
    script: common/tensorrt_llm/query.sh
    args:
      # trtllm-serve arguments
      - --model <<global_vars.hf_model>>
      - --tp_size 8
      - --ep_size 8
      - --max_num_tokens 32000
      - --port 8000
      - --host 0.0.0.0
      - --trust_remote_code
      # separator between server args and query args
      - --
      # tools/query.py arguments; --save is the input of task_1
      - --data /hf-local/modelopt/Speculative-Decoding-Prompt-Samples
      - --save /scratchspace/data
    environment:
      - HF_LOCAL: /hf-local
    slurm_config:
      _factory_: "slurm_factory"
      nodes: 1
      ntasks_per_node: 8
      gpus_per_node: 8
      container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0

  # Step 2: Dump hidden states from target model.
  # Reads the data saved by task_0 and writes the hidden states used by task_2.
  task_1:
    script: common/eagle3/dump_offline_data.sh
    args:
      - --input-data /scratchspace/data
      - --output-dir /scratchspace/offline_hidden_states
      - --max-seq-len 8192
      - --tp 8
      - --moe-ep 8
    environment:
      - HF_MODEL_CKPT: <<global_vars.hf_model>>
    slurm_config:
      _factory_: "slurm_factory"
      nodes: 1
      ntasks_per_node: 8
      gpus_per_node: 8
      container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0

  # Step 3: Train EAGLE3 draft head (offline, single task).
  # One Slurm task is launched on the node, with all 8 GPUs allocated to it.
  task_2:
    script: common/eagle3/train_eagle.sh
    args:
      - --config modules/Model-Optimizer/modelopt_recipes/models/Qwen3-8B/eagle3.yaml
      # key=value overrides applied on top of the recipe above
      - model.model_name_or_path=<<global_vars.hf_model>>
      - data.offline_data_path=/scratchspace/offline_hidden_states
      - training.output_dir=/scratchspace/eagle3
      - training.disable_tqdm=true
      # NOTE(review): presumably set this high to effectively skip AR
      # validation during training -- confirm.
      - training.ar_validate_steps=500000
    slurm_config:
      _factory_: "slurm_factory"
      nodes: 1
      ntasks_per_node: 1
      gpus_per_node: 8
      container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0

  # Step 4: Benchmark speculative decoding (VLLM backend).
  task_3:
    script: common/specdec_bench/quick_check.sh
    args:
      # NOTE(review): task_2 sets training.output_dir=/scratchspace/eagle3, but
      # the draft model is read from /scratchspace/export here. Presumably
      # train_eagle.sh exports the checkpoint to that path -- confirm.
      - --draft_model_dir /scratchspace/export
      - --draft_length 3
      - --output_length 4096
      - --engine VLLM
      - --tp_size 8
      - --ep_size 1
      - --speculative_algorithm EAGLE3
      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
      - --concurrency 1
    environment:
      - HF_MODEL_CKPT: <<global_vars.hf_model>>
    slurm_config:
      _factory_: "slurm_factory"
      nodes: 1
      ntasks_per_node: 1
      gpus_per_node: 8
      # NOTE(review): `latest` is an unpinned tag, unlike the other tasks'
      # versioned images; consider pinning a release for reproducible results.
      container: vllm/vllm-openai:latest