Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions .github/workflows/gpu_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,27 +30,34 @@ jobs:
tests/gpu/**
tests/gpu_megatron/**
tests/gpu_trtllm/**
tests/gpu_vllm/**

gpu-tests:
needs: [pr-gate]
if: needs.pr-gate.outputs.any_changed == 'true'
strategy:
fail-fast: false
matrix:
# ``container_image`` is the full image path so non-nvcr.io registries
# (e.g. docker.io/vllm) can be used alongside nvcr.io/nvidia images.
include:
- example: gpu
timeout: 75
container_image: pytorch:26.03-py3
container_image: nvcr.io/nvidia/pytorch:26.03-py3
- example: gpu_megatron
timeout: 45
container_image: nemo:26.04
container_image: nvcr.io/nvidia/nemo:26.04
- example: gpu_trtllm
timeout: 30
container_image: tensorrt-llm/release:1.3.0rc12
container_image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc12
- example: gpu_vllm
timeout: 30
# Keep in sync with examples/vllm_serve/Dockerfile.
Comment thread
kinjalpatel27 marked this conversation as resolved.
container_image: docker.io/vllm/vllm-openai:v0.20.0
runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }}
timeout-minutes: ${{ matrix.timeout }}
container:
image: nvcr.io/nvidia/${{ matrix.container_image }}
image: ${{ matrix.container_image }}
env:
GIT_DEPTH: 1000 # For correct version
PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
Expand All @@ -63,10 +70,11 @@ jobs:
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
- name: Run gpu tests
env:
COVERAGE_PROCESS_START: ${{ github.workspace }}/pyproject.toml
# Skip subprocess coverage for gpu_vllm — the hook deadlocks vLLM's engine-core IPC.
COVERAGE_PROCESS_START: ${{ matrix.example == 'gpu_vllm' && '' || format('{0}/pyproject.toml', github.workspace) }}
COVERAGE_FILE: ${{ github.workspace }}/.coverage
run: |
python -m pip install nox && nox -s ${{ matrix.example }}
python3 -m pip install nox && nox -s ${{ matrix.example }}
- name: Upload GPU coverage to Codecov
Comment thread
kinjalpatel27 marked this conversation as resolved.
uses: codecov/codecov-action@v5
with:
Expand Down
1 change: 1 addition & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ We use [pytest](https://docs.pytest.org/) for all tests. For any new features /
- `tests/gpu`: Fast GPU-based unit tests for the core ModelOpt library. In most cases, they should not take more than a few seconds to run.
- `tests/gpu_megatron`: Fast GPU-based unit tests for the core ModelOpt library for Megatron-Core features. In most cases, they should not take more than a few seconds to run.
- `tests/gpu_trtllm`: Fast GPU-based unit tests for the core ModelOpt library for TensorRT-LLM features. In most cases, they should not take more than a few seconds to run.
- `tests/gpu_vllm`: Fast GPU-based unit tests for the core ModelOpt library for vLLM features. In most cases, they should not take more than a few seconds to run.
- `tests/examples`: Integration tests for ModelOpt examples. They should not take more than a few minutes to run. Please refer to [example test README](./tests/examples/README.md) for more details.

For lightweight focused local validation, run `pytest` directly on the relevant test path. For example:
Expand Down
8 changes: 8 additions & 0 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,14 @@ def gpu_trtllm(session):
session.run("python", "-m", "pytest", "tests/gpu_trtllm", *_cov_args())


# Container: docker.io/vllm/vllm-openai (the published image ships vLLM + CUDA + torch).
# Pin must stay in sync with examples/vllm_serve/Dockerfile.
Comment thread
kinjalpatel27 marked this conversation as resolved.
@nox.session(venv_backend="none")
def gpu_vllm(session):
    """Install the package in editable mode and run the ``tests/gpu_vllm`` suite.

    The session is declared with ``venv_backend="none"``, so both commands are
    issued through ``python3`` as found on the current PATH.
    """
    python_module = ("python3", "-m")
    # Editable install with the ``hf`` and ``dev-test`` extras.
    session.run(*python_module, "pip", "install", "-e", ".[hf,dev-test]")
    # Coverage flags come from the shared ``_cov_args()`` helper.
    session.run(*python_module, "pytest", "tests/gpu_vllm", *_cov_args())

Comment thread
kinjalpatel27 marked this conversation as resolved.

# Container: nvcr.io/nvidia/pytorch:26.01-py3 or later
@nox.session(venv_backend="none")
def regression(session):
Expand Down
39 changes: 39 additions & 0 deletions tests/_test_utils/torch/transformers_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
AutoModelForQuestionAnswering,
AutoTokenizer,
BertConfig,
DeepseekV3Config,
GptOssConfig,
LlamaConfig,
PreTrainedModel,
Expand Down Expand Up @@ -120,6 +121,44 @@ def create_tiny_qwen3_moe_dir(
return qwen3_moe_dir


##### DeepSeek V3 #####
def get_tiny_deepseek_v3(**config_kwargs) -> PreTrainedModel:
    """Build a tiny DeepSeek V3 causal LM from a config for use in tests.

    The RNG is seeded with ``SEED`` first. The model is then instantiated from a
    ``DeepseekV3Config`` via ``AutoModelForCausalLM.from_config``.

    Args:
        **config_kwargs: Overrides for (or additions to) the tiny default
            configuration values listed below.

    Returns:
        The model created from the resulting configuration.
    """
    set_seed(SEED)
    tiny_defaults = {
        "dtype": torch.bfloat16,
        "vocab_size": 128,
        "hidden_size": 128,
        "intermediate_size": 256,
        "moe_intermediate_size": 64,
        "num_hidden_layers": 2,
        "num_attention_heads": 2,
        "num_key_value_heads": 2,
        "n_routed_experts": 4,
        "num_experts_per_tok": 2,
        "n_shared_experts": 1,
        "first_k_dense_replace": 0,
        "kv_lora_rank": 16,
        "q_lora_rank": 32,
        "qk_rope_head_dim": 16,
        "qk_nope_head_dim": 16,
        "v_head_dim": 16,
        "max_position_embeddings": 128,
        # Required so vLLM allocates ``gate.e_score_correction_bias`` (HF saves it unconditionally).
        "topk_method": "noaux_tc",
    }
    # Caller-supplied values take precedence over the tiny defaults.
    settings = {**tiny_defaults, **config_kwargs}
    config = DeepseekV3Config(**settings)
    # Set the attribute explicitly as well, in case the installed transformers
    # version drops unknown kwargs when constructing the config.
    config.topk_method = settings["topk_method"]
    return AutoModelForCausalLM.from_config(config)


def create_tiny_deepseek_v3_dir(tmp_path: Path | str, **config_kwargs) -> Path:
    """Save a tiny DeepSeek V3 model under ``tmp_path`` and return its directory.

    Args:
        tmp_path: Parent directory; the model is written to
            ``<tmp_path>/tiny_deepseek_v3``.
        **config_kwargs: Forwarded unchanged to ``get_tiny_deepseek_v3``.

    Returns:
        Path of the directory passed to ``save_pretrained``.
    """
    target_dir = Path(tmp_path) / "tiny_deepseek_v3"
    tiny_model = get_tiny_deepseek_v3(**config_kwargs)
    tiny_model.save_pretrained(target_dir)
    return target_dir


##### GPT-OSS #####
def get_tiny_gpt_oss(**config_kwargs) -> PreTrainedModel:
set_seed(SEED)
Expand Down
30 changes: 30 additions & 0 deletions tests/gpu_vllm/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Shared setup for vLLM tests.

vLLM handles its own distributed init, current-vllm-config context, and
parallel-state setup when ``LLM(...)`` is constructed, so this conftest has a
single job: opt into ``VLLM_ALLOW_INSECURE_SERIALIZATION=1`` *before* vLLM is
imported. That lets ``LLM.collective_rpc(callable)`` ship our worker callables
over the engine IPC channel via pickle; without it, the default msgpack encoder
rejects raw functions and the call raises ``TypeError``. Only safe in a
controlled test environment.
"""

import os

# Must be in place before any ``import vllm``: the env is read at module-import
# time. A value already present in the environment is left untouched.
if "VLLM_ALLOW_INSECURE_SERIALIZATION" not in os.environ:
    os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"
Loading
Loading