- name: Run gpu tests
  env:
    # Skip subprocess coverage for gpu_vllm — the hook deadlocks vLLM's engine-core IPC.
    # The empty string must be the `||` fallback: GitHub expressions treat '' as
    # falsy, so `cond && '' || path` would always evaluate to `path` and the
    # coverage hook would stay enabled for gpu_vllm.
    COVERAGE_PROCESS_START: ${{ matrix.example != 'gpu_vllm' && format('{0}/pyproject.toml', github.workspace) || '' }}
    COVERAGE_FILE: ${{ github.workspace }}/.coverage
  run: |
    python3 -m pip install nox && nox -s ${{ matrix.example }}
For example: diff --git a/examples/vllm_serve/Dockerfile b/examples/vllm_serve/Dockerfile index 352896ca2cc..7213c6fc430 100644 --- a/examples/vllm_serve/Dockerfile +++ b/examples/vllm_serve/Dockerfile @@ -1,4 +1,4 @@ -FROM vllm/vllm-openai:v0.10.2 +FROM vllm/vllm-openai:v0.20.0 # Set environment variables ENV PIP_NO_CACHE_DIR=off \ @@ -23,17 +23,11 @@ RUN cd Model-Optimizer && \ pip install -e ".[all,dev-test]" # Llama4 requires this -RUN pip install flash-attn==2.7.4.post1 +RUN pip install flash-attn==2.7.4.post1 --no-build-isolation # Pre-compile CUDA extensions to avoid compilation time during runtime RUN python3 -c "import modelopt.torch.quantization.extensions as ext; ext.precompile()" || true -# Install requirements from examples (excluding windows examples) -RUN find Model-Optimizer/examples -name "requirements.txt" | grep -v "windows" | while read req_file; do \ - echo "Installing from $req_file"; \ - pip install -r "$req_file" || echo "Warning: Failed to install from $req_file"; \ - done - # Allow users to run without root RUN chmod -R 777 /workspace diff --git a/noxfile.py b/noxfile.py index 3ce01e909f9..a971d795ada 100644 --- a/noxfile.py +++ b/noxfile.py @@ -135,6 +135,14 @@ def gpu_trtllm(session): session.run("python", "-m", "pytest", "tests/gpu_trtllm", *_cov_args()) +# Container: docker.io/vllm/vllm-openai (the published image ships vLLM + CUDA + torch). +# Pin must stay in sync with examples/vllm_serve/Dockerfile. 
# Container: docker.io/vllm/vllm-openai (the published image ships vLLM + CUDA + torch).
# Pin must stay in sync with examples/vllm_serve/Dockerfile.
@nox.session(venv_backend="none")
def gpu_vllm(session):
    """Install ModelOpt in editable mode, then run the ``tests/gpu_vllm`` suite.

    Both commands go through ``python3 -m`` and run in the interpreter that is
    already on the PATH (``venv_backend="none"``).
    """
    py_module = ("python3", "-m")
    # Editable install with the ``hf`` and ``dev-test`` extras.
    session.run(*py_module, "pip", "install", "-e", ".[hf,dev-test]")
    # Coverage flags, if any, are supplied by the shared ``_cov_args`` helper.
    session.run(*py_module, "pytest", "tests/gpu_vllm", *_cov_args())
##### DeepSeek V3 #####
def get_tiny_deepseek_v3(**config_kwargs) -> PreTrainedModel:
    """Build a tiny, randomly initialised DeepSeek-V3 causal LM.

    Args:
        **config_kwargs: Overrides merged on top of the tiny default config.

    Returns:
        The model created by ``AutoModelForCausalLM.from_config``.
    """
    set_seed(SEED)
    tiny_defaults = dict(
        dtype=torch.bfloat16,
        vocab_size=128,
        hidden_size=128,
        intermediate_size=256,
        moe_intermediate_size=64,
        num_hidden_layers=2,
        num_attention_heads=2,
        num_key_value_heads=2,
        n_routed_experts=4,
        num_experts_per_tok=2,
        n_shared_experts=1,
        first_k_dense_replace=0,
        kv_lora_rank=16,
        q_lora_rank=32,
        qk_rope_head_dim=16,
        qk_nope_head_dim=16,
        v_head_dim=16,
        max_position_embeddings=128,
        # Required so vLLM allocates ``gate.e_score_correction_bias`` (HF saves it unconditionally).
        topk_method="noaux_tc",
    )
    # Caller-supplied values win over the defaults.
    merged = {**tiny_defaults, **config_kwargs}
    config = DeepseekV3Config(**merged)
    # Survive transformers versions that drop unknown kwargs from the dataclass.
    config.topk_method = merged["topk_method"]
    return AutoModelForCausalLM.from_config(config)


def create_tiny_deepseek_v3_dir(
    tmp_path: Path | str, with_tokenizer: bool = False, **config_kwargs
) -> Path:
    """Save a tiny DeepSeek-V3 checkpoint under ``tmp_path / "tiny_deepseek_v3"``.

    Args:
        tmp_path: Parent directory for the checkpoint.
        with_tokenizer: When true, also save the tiny tokenizer into the same
            directory and size the model's vocabulary to match it.
        **config_kwargs: Forwarded to ``get_tiny_deepseek_v3``.

    Returns:
        Path of the checkpoint directory.
    """
    target_dir = Path(tmp_path) / "tiny_deepseek_v3"
    if with_tokenizer:
        tiny_tokenizer = get_tiny_tokenizer()
        tiny_tokenizer.save_pretrained(target_dir)
        # The tokenizer's vocabulary size overrides any caller-provided value.
        config_kwargs["vocab_size"] = tiny_tokenizer.vocab_size
    tiny_model = get_tiny_deepseek_v3(**config_kwargs)
    tiny_model.save_pretrained(target_dir)
    return target_dir
"""Enable ``VLLM_ALLOW_INSECURE_SERIALIZATION`` before vLLM is imported.

``LLM.collective_rpc(callable)`` needs this flag to pickle worker callables.
pytest loads conftests before sibling test modules, so setting it here takes
effect ahead of the top-level ``from vllm import LLM`` in ``test_*.py``.
"""

import os

# Only supply a default; a value already exported by the caller is left untouched.
if "VLLM_ALLOW_INSECURE_SERIALIZATION" not in os.environ:
    os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"
def _quantize_and_summarize(self):
    """Quantize the worker's model and report which quantizers were installed.

    Runs on the worker via ``LLM.collective_rpc``. Kept at module level so it
    survives pickling over engine-core IPC. ``self`` is the vLLM worker; it is
    needed to drive ``model_runner._dummy_run`` from the calibration loop.

    Returns:
        A JSON-able dict with per-class parallel-linear counts, MoE / attention /
        MLA module counts, the names of expected-but-missing quantizer slots,
        the names of enabled quantizers lacking ``_amax``, and the number of
        enabled quantizers.
    """
    model = self.get_model()

    def _calibrate(_model):
        # ``num_tokens=1`` is enough for the ``"max"`` calibrator.
        self.model_runner._dummy_run(1)

    with disable_compilation(model):
        mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop=_calibrate)

    # Quantizer attributes each wrapped module family is expected to expose.
    linear_slots = ("input_quantizer", "weight_quantizer", "output_quantizer")
    moe_slots = (
        "w13_input_quantizer",
        "w2_input_quantizer",
        "w13_weight_quantizer",
        "w2_weight_quantizer",
    )
    mla_slots = ("q_bmm_quantizer", "kv_c_bmm_quantizer", "k_pe_bmm_quantizer")
    attention_slots = ("q_bmm_quantizer", "k_bmm_quantizer", "v_bmm_quantizer")

    report = {
        "parallel_linear_counts": {},
        "moe_count": 0,
        "attention_count": 0,
        "mla_count": 0,
        "missing_quantizers": [],
        "quantizers_without_amax": [],
        "enabled_quantizer_count": 0,
    }

    def _record_missing(module, name, slots):
        # A slot counts as missing unless it holds a ``TensorQuantizer``.
        for slot in slots:
            if not isinstance(getattr(module, slot, None), TensorQuantizer):
                report["missing_quantizers"].append(f"{name}.{slot}")

    for name, module in model.named_modules():
        if isinstance(module, _VLLMParallelLinear):
            class_name = type(module).__name__
            per_class = report["parallel_linear_counts"]
            per_class[class_name] = per_class.get(class_name, 0) + 1
            _record_missing(module, name, linear_slots)
        elif isinstance(module, _QuantFusedMoEBase):
            report["moe_count"] += 1
            _record_missing(module, name, moe_slots)
        elif VllmMLAAttention is not None and isinstance(module, VllmMLAAttention):
            # MLA is tested before the generic attention types.
            report["mla_count"] += 1
            _record_missing(module, name, mla_slots)
        elif isinstance(module, _ATTENTION_TYPES):
            report["attention_count"] += 1
            _record_missing(module, name, attention_slots)

        # Static-amax invariant: every enabled quantizer must own an ``_amax``
        # after calibration. ``kv_b_proj`` is exempt — vLLM's MLA decode path
        # reads its weight directly and never calls its forward.
        if isinstance(module, TensorQuantizer) and module.is_enabled:
            report["enabled_quantizer_count"] += 1
            if "kv_b_proj" not in name and not hasattr(module, "_amax"):
                report["quantizers_without_amax"].append(name)

    return report
def _boot_llm(model_dir, **extra):
    """Construct a vLLM engine on a tiny model.

    MoE fixtures pass ``moe_backend="triton"`` (pins the Triton experts kernel
    whose module-level entries the modelopt plugin patches — FlashInfer/TRTLLM
    kernels bypass them) and ``enable_expert_parallel=True`` (keeps modelopt's
    MoE-specific calibration paths live).

    Args:
        model_dir: Directory holding the saved tiny checkpoint.
        **extra: Additional ``LLM`` keyword arguments; repeating one of the
            defaults below raises ``TypeError``, exactly like a duplicated keyword.
    """
    engine_defaults = dict(
        model=str(model_dir),
        enforce_eager=True,
        gpu_memory_utilization=0.2,
        max_model_len=64,
        max_num_seqs=1,
        dtype="bfloat16",
        skip_tokenizer_init=True,
    )
    return LLM(**engine_defaults, **extra)


def _shutdown_llm(llm):
    """Drop the local engine reference, collect garbage, then reset vLLM's distributed state."""
    # NOTE(review): ``del`` only removes this function's own reference; the
    # calling frame may still hold the engine — confirm this is sufficient.
    del llm
    gc.collect()
    cleanup_dist_env_and_memory(shutdown_ray=False)


def _serve_until_teardown(model_dir, **engine_kwargs):
    """Yield a booted engine and shut it down once the consuming fixture is finalised."""
    engine = _boot_llm(model_dir, **engine_kwargs)
    try:
        yield engine
    finally:
        _shutdown_llm(engine)


@pytest.fixture(scope="module")
def tiny_llama_llm(tmp_path_factory):
    """Module-scoped vLLM engine running a tiny dense Llama."""
    workdir = tmp_path_factory.mktemp("tiny_llama")
    # Helper default ``max_position_embeddings=32`` would clash with vLLM's
    # ``max_model_len=64`` set in ``_boot_llm``.
    checkpoint = create_tiny_llama_dir(workdir, max_position_embeddings=64)
    yield from _serve_until_teardown(checkpoint)


@pytest.fixture(scope="module")
def tiny_qwen3_moe_llm(tmp_path_factory):
    """Module-scoped vLLM engine running a tiny Qwen3-MoE."""
    workdir = tmp_path_factory.mktemp("tiny_qwen3_moe")
    # head_dim=64 with num_heads=2 is broadly supported by vLLM's attention backends.
    qwen_overrides = dict(
        hidden_size=128,
        intermediate_size=256,
        moe_intermediate_size=64,
        num_hidden_layers=2,
        num_attention_heads=2,
        num_key_value_heads=1,
        max_position_embeddings=128,
        vocab_size=128,
        head_dim=64,
        num_experts=4,
        num_experts_per_tok=2,
        decoder_sparse_step=1,
    )
    checkpoint = create_tiny_qwen3_moe_dir(workdir, **qwen_overrides)
    yield from _serve_until_teardown(
        checkpoint, moe_backend="triton", enable_expert_parallel=True
    )


@pytest.fixture(scope="module")
def tiny_deepseek_llm(tmp_path_factory):
    """Module-scoped vLLM engine running a tiny DeepSeek-V3."""
    workdir = tmp_path_factory.mktemp("tiny_deepseek")
    checkpoint = create_tiny_deepseek_v3_dir(workdir)
    yield from _serve_until_teardown(
        checkpoint, moe_backend="triton", enable_expert_parallel=True
    )
def _assert_quantizer_amax_is_static(summary):
    """Check that calibration left a registered ``_amax`` on every enabled quantizer.

    A missing ``_amax`` shows up in the quantizer repr as ``amax=dynamic`` and
    is treated as a regression.
    """
    assert summary["enabled_quantizer_count"] > 0, summary
    uncalibrated = summary["quantizers_without_amax"]
    assert uncalibrated == [], uncalibrated


def test_tiny_llama_quantize(tiny_llama_llm):
    """Covers QKV/Row/MergedColumn ParallelLinear + Attention on a dense Llama."""
    summary = tiny_llama_llm.collective_rpc(_quantize_and_summarize)[0]

    assert summary["missing_quantizers"] == [], summary["missing_quantizers"]

    linear_counts = summary["parallel_linear_counts"]
    # Each decoder layer contributes one QKV and one merged-column linear, plus
    # two row-parallel linears (o_proj + down_proj). With num_hidden_layers=2:
    expected_minimums = (
        ("QuantQKVParallelLinear", 2),
        ("QuantRowParallelLinear", 4),
        ("QuantMergedColumnParallelLinear", 2),
    )
    for class_name, minimum in expected_minimums:
        assert linear_counts.get(class_name, 0) >= minimum, linear_counts

    # Llama uses the base Attention type — one per decoder layer.
    assert summary["attention_count"] >= 2, summary

    # No MoE in a dense Llama.
    assert summary["moe_count"] == 0

    _assert_quantizer_amax_is_static(summary)


def test_tiny_qwen3_moe_quantize(tiny_qwen3_moe_llm):
    """Tiny Qwen3-MoE adds FusedMoE coverage on top of the dense linears."""
    summary = tiny_qwen3_moe_llm.collective_rpc(_quantize_and_summarize)[0]

    assert summary["missing_quantizers"] == [], summary["missing_quantizers"]

    linear_counts = summary["parallel_linear_counts"]
    for class_name in ("QuantQKVParallelLinear", "QuantRowParallelLinear"):
        assert linear_counts.get(class_name, 0) >= 2, linear_counts

    # decoder_sparse_step=1 → every layer is MoE. With 2 layers we expect ≥2 FusedMoE.
    assert summary["moe_count"] >= 2, summary
    assert summary["attention_count"] >= 2, summary

    _assert_quantizer_amax_is_static(summary)


def test_tiny_deepseek_mla_quantize(tiny_deepseek_llm):
    """Tiny DeepSeek-V3 covers MLAAttention (and again FusedMoE)."""
    summary = tiny_deepseek_llm.collective_rpc(_quantize_and_summarize)[0]

    assert summary["missing_quantizers"] == [], summary["missing_quantizers"]
    assert summary["mla_count"] >= 2, summary
    # ``first_k_dense_replace=0`` → every layer is MoE.
    assert summary["moe_count"] >= 2, summary

    _assert_quantizer_amax_is_static(summary)