# Captured from the GitHub Actions run page "Try other test data" (#4414):
# workflow file for this run.

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: CICD NeMo
on:
  # Daily scheduled run (cron fields: minute 0, hour 0, every day).
  schedule:
    - cron: '0 0 * * *'
  # Pushes to main, to mirrored pull-request branches, and to deploy-release branches.
  push:
    branches:
      - main
      - "pull-request/[0-9]+"
      - "deploy-release/*"
  # Allow MCore to trigger this workflow remotely for compatibility testing
  workflow_dispatch:
    inputs:
      # When non-empty, jobs check out this commit in the 3rdparty/Megatron-LM submodule.
      mcore_commit:
        description: 'MCore commit SHA to test against'
        required: false
        type: string
      # Informational only; jobs just echo it.
      mcore_branch:
        description: 'MCore branch name (for reference)'
        required: false
        type: string
      # Remote that `git fetch` pulls the commit from.
      mcore_repo:
        description: 'MCore repository URL (for fetching from forks)'
        required: false
        type: string
        default: 'https://github.com/NVIDIA/Megatron-LM.git'
      # Selects which of the unit / functional test jobs run.
      test_suite:
        description: 'Test suite to run'
        required: false
        type: choice
        options:
          - all
          - unit-only
          - functional-only
        default: all
      # Informational only; jobs just echo it.
      triggered_by:
        description: 'Trigger source (for tracking)'
        required: false
        type: string
        default: manual
# One active run per workflow / PR-or-ref / label / event combination;
# a newer run in the same group cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
  cancel-in-progress: true
# Default token permissions for all jobs.
permissions:
  # NOTE(review): presumably needed for the OIDC-based `azure/login` steps - confirm.
  id-token: write
  contents: read
# Workflow-wide environment.
env:
  # Registry host that jobs prefix to the megatron-bridge image and build-cache refs.
  container-registry: nemoci.azurecr.io
jobs:
# Reusable workflow whose outputs (is_ci_workload, is_deployment_workflow,
# docs_only, force_run_all) gate the jobs that list it in `needs`.
pre-flight:
  # NOTE(review): the "[email protected]" segment looks like an e-mail-obfuscation
  # artifact that replaced the real "<workflow file>@<ref>"; restore it from the
  # repository before relying on this line.
  uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
# Runs the repository's pre-commit hooks against all files.
lint-check:
  name: Lint check
  runs-on: ubuntu-latest
  needs: [pre-flight]
  # Never runs for deployment workflows. Otherwise runs for CI workloads, and
  # for non-CI workloads unless the change is docs-only.
  if: |
    (
      needs.pre-flight.outputs.is_deployment_workflow == 'false'
      && needs.pre-flight.outputs.is_ci_workload == 'true'
    ) || (
      needs.pre-flight.outputs.is_deployment_workflow == 'false'
      && needs.pre-flight.outputs.is_ci_workload == 'false'
      && needs.pre-flight.outputs.docs_only == 'false'
    )
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        submodules: "recursive"
    - name: Update MCore submodule (if triggered from MCore)
      if: ${{ github.event.inputs.mcore_commit != '' }}
      # Dispatch inputs are handed to the script through environment variables
      # rather than being expanded into the script text, so a crafted input
      # value cannot inject shell commands.
      env:
        MCORE_COMMIT: ${{ github.event.inputs.mcore_commit }}
        MCORE_BRANCH: ${{ github.event.inputs.mcore_branch || 'unknown' }}
        MCORE_REPO_LABEL: ${{ github.event.inputs.mcore_repo || 'https://github.com/NVIDIA/Megatron-LM.git' }}
        MCORE_REMOTE: ${{ github.event.inputs.mcore_repo || 'origin' }}
        TRIGGERED_BY: ${{ github.event.inputs.triggered_by }}
      run: |
        echo "🔄 Updating MCore submodule to commit: ${MCORE_COMMIT}"
        echo "📌 MCore branch: ${MCORE_BRANCH}"
        echo "📍 MCore repo: ${MCORE_REPO_LABEL}"
        echo "🎯 Triggered by: ${TRIGGERED_BY}"
        cd 3rdparty/Megatron-LM
        git fetch "${MCORE_REMOTE}" "${MCORE_COMMIT}"
        git checkout "${MCORE_COMMIT}"
        # Verify the checkout was successful
        ACTUAL_COMMIT=$(git rev-parse HEAD)
        EXPECTED_COMMIT="${MCORE_COMMIT}"
        echo "✅ MCore submodule updated successfully"
        echo "Expected: ${EXPECTED_COMMIT}"
        echo "Actual: ${ACTUAL_COMMIT}"
        if [ "${ACTUAL_COMMIT}" != "${EXPECTED_COMMIT}" ]; then
          echo "❌ ERROR: MCore commit mismatch!"
          exit 1
        fi
        git log -1 --oneline
        cd ../..
    - name: Set environment for MCore testing
      if: ${{ github.event.inputs.mcore_commit != '' }}
      run: |
        echo "MCORE_TRIGGERED_TESTING=true" | tee -a "$GITHUB_ENV"
        echo "⚙️ MCore testing mode: skipping --locked flag because lockfile was generated with different MCore version"
    - name: Check lint
      run: |
        pip install pre-commit==3.6.0
        pre-commit install
        pre-commit run --all-files --show-diff-on-failure --color=always
# Gate job bound to the `test` environment; it only prints a message.
# Skipped for CI workloads, deployment workflows, and docs-only changes.
cicd-wait-in-queue:
  runs-on: ubuntu-latest
  environment: test
  needs:
    - pre-flight
    - lint-check
  if: |
    !(needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true'
      || needs.pre-flight.outputs.docs_only == 'true')
  steps:
    - name: Running CI tests
      run: |
        echo "Running CI tests"
# Builds the CI image from docker/Dockerfile.ci and pushes it to the registry,
# tagged with the PR number (0 when there is none) and the commit SHA.
cicd-container-build:
  needs: [pre-flight, cicd-wait-in-queue]
  runs-on: nemo-ci-aws-gpu-x2
  environment: nemo-ci
  # Runs when upstream succeeded, or regardless of upstream for CI workloads /
  # forced runs - but never after cancellation.
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
    )
    && !cancelled()
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: recursive
    - name: Update MCore submodule (if triggered from MCore)
      if: ${{ github.event.inputs.mcore_commit != '' }}
      # Dispatch inputs are handed to the script through environment variables
      # rather than being expanded into the script text, so a crafted input
      # value cannot inject shell commands.
      env:
        MCORE_COMMIT: ${{ github.event.inputs.mcore_commit }}
        MCORE_BRANCH: ${{ github.event.inputs.mcore_branch || 'unknown' }}
        MCORE_REPO_LABEL: ${{ github.event.inputs.mcore_repo || 'https://github.com/NVIDIA/Megatron-LM.git' }}
        MCORE_REMOTE: ${{ github.event.inputs.mcore_repo || 'origin' }}
        TRIGGERED_BY: ${{ github.event.inputs.triggered_by }}
      run: |
        echo "🔄 Updating MCore submodule to commit: ${MCORE_COMMIT}"
        echo "📌 MCore branch: ${MCORE_BRANCH}"
        echo "📍 MCore repo: ${MCORE_REPO_LABEL}"
        echo "🎯 Triggered by: ${TRIGGERED_BY}"
        cd 3rdparty/Megatron-LM
        git fetch "${MCORE_REMOTE}" "${MCORE_COMMIT}"
        git checkout "${MCORE_COMMIT}"
        # Verify the checkout was successful
        ACTUAL_COMMIT=$(git rev-parse HEAD)
        EXPECTED_COMMIT="${MCORE_COMMIT}"
        echo "✅ MCore submodule updated successfully"
        echo "Expected: ${EXPECTED_COMMIT}"
        echo "Actual: ${ACTUAL_COMMIT}"
        if [ "${ACTUAL_COMMIT}" != "${EXPECTED_COMMIT}" ]; then
          echo "❌ ERROR: MCore commit mismatch!"
          exit 1
        fi
        git log -1 --pretty=format:"📝 Commit: %H%n👤 Author: %an%n📅 Date: %ad%n💬 Message: %s" --date=short
        cd ../..
        # Store for Docker build arg
        echo "MCORE_COMMIT_SHA=${EXPECTED_COMMIT}" | tee -a "$GITHUB_ENV"
    - name: Set environment for MCore testing
      if: ${{ github.event.inputs.mcore_commit != '' }}
      run: |
        echo "MCORE_TRIGGERED_TESTING=true" | tee -a "$GITHUB_ENV"
        echo "⚙️ MCore testing mode: skipping --locked flag because lockfile was generated with different MCore version"
    - name: Setup python
      uses: actions/setup-python@v5
      with:
        # Quoted so the version is always read as a string, never as a float.
        python-version: "3.12"
    - name: Get PR info
      id: get-pr-info
      if: startsWith(github.ref, 'refs/heads/pull-request/')
      uses: nv-gha-runners/get-pr-info@main
    - name: Install Azure CLI
      shell: bash
      run: |
        echo "::group::Install Azure CLI"
        # Install via the InstallAzureCLIDeb script, piped to bash under sudo
        curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
        echo "::endgroup::"
    - name: Azure Login
      uses: azure/login@v2
      with:
        client-id: ${{ secrets.AZURE_CLIENT_ID }}
        tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
    - name: Azure ACR Login
      shell: bash
      run: |
        az acr login --name nemoci
    - name: Install GH CLI
      shell: bash
      # NOTE(review): runs apt-get without sudo while the Azure CLI step uses
      # sudo - presumably the runner user is root; confirm.
      run: |
        apt-get update
        apt-get install -y gh
    # Emits one `cache-from` registry entry per PR number returned by the
    # query (up to 100 merged PRs, most recently updated first).
    - name: Get last merged PR
      id: cache_from
      env:
        GH_TOKEN: ${{ github.token }}
      run: |
        LAST_PRS=$(gh api graphql -f query='
          query {
            repository(owner: "NVIDIA-NeMo", name: "Megatron-Bridge") {
              pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
                nodes {
                  number
                }
              }
            }
          }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
            echo "type=registry,ref=${{ env.container-registry }}/megatron-bridge:$number-buildcache,mode=max"
          done)
        # Multi-line step output using the NAME<<EOF delimiter form
        echo "LAST_PRS<<EOF" | tee -a "$GITHUB_OUTPUT"
        echo "$LAST_PRS" | tee -a "$GITHUB_OUTPUT"
        echo "EOF" | tee -a "$GITHUB_OUTPUT"
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3
    - name: Build and push
      uses: docker/build-push-action@v5
      with:
        file: ./docker/Dockerfile.ci
        push: true
        context: .
        build-args: |
          FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:25.09-py3
          MCORE_TRIGGERED_TESTING=${{ env.MCORE_TRIGGERED_TESTING || 'false' }}
          MCORE_COMMIT_SHA=${{ env.MCORE_COMMIT_SHA || 'unknown' }}
        cache-from: |
          type=registry,ref=${{ env.container-registry }}/megatron-bridge:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
          type=registry,ref=${{ env.container-registry }}/megatron-bridge:main-buildcache,mode=max
          ${{ steps.cache_from.outputs.LAST_PRS }}
        cache-to: |
          type=registry,ref=${{ env.container-registry }}/megatron-bridge:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
        no-cache: false
        tags: |
          ${{ env.container-registry }}/megatron-bridge:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}
          ${{ env.container-registry }}/megatron-bridge:${{ github.sha }}
        secrets: |
          GH_TOKEN=${{ secrets.PAT }}
    - name: Verify MCore commit in Docker image
      if: ${{ github.event.inputs.mcore_commit != '' }}
      run: |
        echo "🔍 Verifying MCore commit inside Docker image..."
        # MCORE_COMMIT_SHA was exported through GITHUB_ENV by the submodule
        # update step; read it from the environment instead of templating it
        # into the script.
        EXPECTED="${MCORE_COMMIT_SHA}"
        # Get last line first (the SHA), then trim whitespace
        ACTUAL=$(docker run --rm ${{ env.container-registry }}/megatron-bridge:${{ github.sha }} \
          cat /opt/Megatron-Bridge/.mcore_commit_sha 2>&1 | tail -1)
        ACTUAL=$(echo "${ACTUAL}" | tr -d '\n\r ')
        # Trim whitespace from expected value too
        EXPECTED=$(echo "${EXPECTED}" | tr -d '\n\r ')
        echo "Expected MCore commit: ${EXPECTED}"
        echo "Actual MCore commit: ${ACTUAL}"
        if [ "${ACTUAL}" = "${EXPECTED}" ]; then
          echo "✅ MCore commit verified in Docker image!"
        else
          echo "❌ ERROR: MCore commit mismatch in Docker image! The build did not use the correct MCore version."
          echo "Expected length: ${#EXPECTED}"
          echo "Actual length: ${#ACTUAL}"
          exit 1
        fi
# Runs the Launch_Unit_Tests script through the local test-template action,
# using the image tagged with this commit's SHA.
cicd-unit-tests:
  name: Launch_Unit_Tests
  needs: [pre-flight, cicd-wait-in-queue, cicd-container-build]
  runs-on: nemo-ci-aws-gpu-x2
  environment: nemo-ci
  # Same upstream gate as the container build, and additionally requires the
  # dispatch `test_suite` input to be unset, 'all', or 'unit-only'.
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
    )
    && !cancelled()
    && (github.event.inputs.test_suite == '' || github.event.inputs.test_suite == 'all' || github.event.inputs.test_suite == 'unit-only')
  env:
    TRANSFORMERS_OFFLINE: "1"
    HF_HUB_OFFLINE: "1"
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: recursive
    - name: Update MCore submodule (if triggered from MCore)
      if: ${{ github.event.inputs.mcore_commit != '' }}
      # Dispatch inputs are handed to the script through environment variables
      # rather than being expanded into the script text, so a crafted input
      # value cannot inject shell commands.
      env:
        MCORE_COMMIT: ${{ github.event.inputs.mcore_commit }}
        MCORE_REMOTE: ${{ github.event.inputs.mcore_repo || 'origin' }}
      run: |
        echo "🔄 Updating MCore submodule for unit tests..."
        cd 3rdparty/Megatron-LM
        git fetch "${MCORE_REMOTE}" "${MCORE_COMMIT}"
        git checkout "${MCORE_COMMIT}"
        ACTUAL_COMMIT=$(git rev-parse HEAD)
        EXPECTED_COMMIT="${MCORE_COMMIT}"
        echo "🧪 UNIT TESTS - MCore verification:"
        echo "Expected: ${EXPECTED_COMMIT}"
        echo "Actual: ${ACTUAL_COMMIT}"
        if [ "${ACTUAL_COMMIT}" != "${EXPECTED_COMMIT}" ]; then
          echo "❌ ERROR: MCore commit mismatch!"
          exit 1
        fi
        echo "✅ MCore commit verified"
        echo "📦 Container: ${{ env.container-registry }}/megatron-bridge:${{ github.sha }}"
        git log -1 --oneline
    - name: main
      uses: ./.github/actions/test-template
      with:
        script: Launch_Unit_Tests
        timeout: 15
        is_unit_test: "true"
        has-azure-credentials: "true"
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        PAT: ${{ secrets.PAT }}
        container-image: ${{ env.container-registry }}/megatron-bridge:${{ github.sha }}
# Fans out one job per L2 functional-test script through the local
# test-template action, using the image tagged with this commit's SHA.
cicd-functional-tests:
  name: ${{ matrix.script }}
  strategy:
    fail-fast: false
    max-parallel: 5
    matrix:
      # `timeout` is optional per entry; the test step falls back to 30.
      include:
        - script: L2_Launch_training
          timeout: 40
        - script: L2_Launch_converter
        - script: L2_Launch_models_deepseek
        - script: L2_Launch_models_gemma
        - script: L2_Launch_models_gemma_vl
        - script: L2_Launch_models_glm
        - script: L2_Launch_models_gpt_oss
        - script: L2_Launch_models_llama_nemotron
        - script: L2_Launch_models_mistral
        - script: L2_Launch_models_nemotron
        - script: L2_Launch_models_nemotronh
        - script: L2_Launch_models_nemotron_vl
        - script: L2_Launch_models_olmoe
        - script: L2_Launch_models_qwen
        - script: L2_Launch_models_qwen_quantization
        - script: L2_Launch_models_qwen_vl
        - script: L2_Launch_recipes_llama_1b
        - script: L2_Launch_recipes_llama_3b
        - script: L2_Launch_recipes_llama_distill
        - script: L2_Launch_recipes_mamba
        - script: L2_Launch_recipes_nemotronh
        - script: L2_Launch_recipes_qwen
        - script: L2_Launch_data
        - script: L2_Launch_post_training_quantization
        - script: L2_Launch_quantization_aware_training
        - script: L2_Launch_quantization_export
  # `pre-flight` must be a direct dependency: the `needs` context only exposes
  # outputs of jobs listed here, and the `if:` below reads
  # needs.pre-flight.outputs.*. Without it those clauses are always false.
  needs: [pre-flight, cicd-unit-tests]
  runs-on: nemo-ci-aws-gpu-x2
  # Same upstream gate as the unit tests, and additionally requires the
  # dispatch `test_suite` input to be unset, 'all', or 'functional-only'.
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
    )
    && !cancelled()
    && (github.event.inputs.test_suite == '' || github.event.inputs.test_suite == 'all' || github.event.inputs.test_suite == 'functional-only')
  environment: nemo-ci
  env:
    HF_HOME: /home/TestData/HF_HOME
    TRANSFORMERS_OFFLINE: "1"
    HF_HUB_OFFLINE: "1"
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: recursive
    - name: Update MCore submodule (if triggered from MCore)
      if: ${{ github.event.inputs.mcore_commit != '' }}
      # Dispatch inputs are handed to the script through environment variables
      # rather than being expanded into the script text, so a crafted input
      # value cannot inject shell commands.
      env:
        MCORE_COMMIT: ${{ github.event.inputs.mcore_commit }}
        MCORE_REMOTE: ${{ github.event.inputs.mcore_repo || 'origin' }}
      run: |
        echo "🔄 Updating MCore submodule for functional tests..."
        cd 3rdparty/Megatron-LM
        git fetch "${MCORE_REMOTE}" "${MCORE_COMMIT}"
        git checkout "${MCORE_COMMIT}"
        ACTUAL_COMMIT=$(git rev-parse HEAD)
        EXPECTED_COMMIT="${MCORE_COMMIT}"
        echo "🧪 FUNCTIONAL TESTS - MCore verification:"
        echo "Expected: ${EXPECTED_COMMIT}"
        echo "Actual: ${ACTUAL_COMMIT}"
        if [ "${ACTUAL_COMMIT}" != "${EXPECTED_COMMIT}" ]; then
          echo "❌ ERROR: MCore commit mismatch!"
          exit 1
        fi
        echo "✅ MCore commit verified"
        echo "📦 Container: ${{ env.container-registry }}/megatron-bridge:${{ github.sha }}"
        git log -1 --oneline
    - name: main
      uses: ./.github/actions/test-template
      with:
        script: ${{ matrix.script }}
        timeout: ${{ matrix.timeout || 30 }}
        is_unit_test: "false"
        has-azure-credentials: "true"
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        PAT: ${{ secrets.PAT }}
        container-image: ${{ env.container-registry }}/megatron-bridge:${{ github.sha }}
# Summary gate: fails when any completed job in this run has a conclusion
# other than "success", unless skipping is allowed (docs-only change,
# deployment workflow, or CI workload).
Nemo_CICD_Test:
  needs:
    - pre-flight
    - cicd-unit-tests
    - cicd-functional-tests
  # `always()` makes the parenthesised clause true in every case, so the job
  # runs whenever the workflow has not been cancelled.
  if: |
    (
      needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true'
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || always()
    )
    && !cancelled()
  runs-on: ubuntu-latest
  # NOTE(review): write-all looks broader than the steps below need
  # (checkout plus reading the run's jobs) - confirm before narrowing.
  permissions: write-all
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Get workflow result
      id: result
      shell: bash -x -e -u -o pipefail {0}
      env:
        GH_TOKEN: ${{ github.token }}
        RUN_ID: ${{ github.run_id }}
        SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }}
      run: |
        # Best effort: if the job list cannot be fetched, fall back to 0 failed
        # jobs explicitly (the former `|| echo 0` only printed 0 and never
        # assigned it) and surface a warning instead of failing silently.
        if ! FAILED_JOBS=$(gh run view "$GITHUB_RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length'); then
          echo "::warning::Could not query job results for this run; assuming 0 failed jobs"
          FAILED_JOBS=0
        fi
        if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
          echo "✅ All previous jobs completed successfully"
          exit 0
        else
          echo "❌ Found $FAILED_JOBS failed job(s)"
          # Show which jobs failed
          gh run view "$GITHUB_RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
          exit 1
        fi
# Posts a successful `codecov/patch` commit status for docs-only changes and
# deployment workflows that are not CI workloads.
Coverage_Fake:
  needs:
    - Nemo_CICD_Test
    - pre-flight
  runs-on: ubuntu-latest
  environment: nemo-ci
  if: |
    (
      needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true'
    )
    && needs.pre-flight.outputs.is_ci_workload == 'false'
    && !cancelled()
  steps:
    - name: Generate fake coverage report
      uses: actions/github-script@v6
      with:
        github-token: ${{ secrets.PAT }}
        # Creates the commit status on the SHA this workflow run is for.
        script: |
          await github.rest.repos.createCommitStatus({
            owner: context.repo.owner,
            repo: context.repo.repo,
            sha: context.sha,
            state: 'success',
            description: 'No code changes - coverage check skipped',
            context: 'codecov/patch'
          });
# Combines the per-job coverage artifacts for each flag, uploads the result to
# Codecov, and stores the combined `.coverage` file as an artifact.
Coverage:
  needs:
    - Nemo_CICD_Test
    - pre-flight
  runs-on: ubuntu-latest
  # Runs on success, or for CI workloads as long as nothing failed; never
  # after cancellation.
  if: |
    (
      (needs.pre-flight.outputs.is_ci_workload == 'true' && !failure())
      || success()
    )
    && !cancelled()
  strategy:
    matrix:
      # One job per flag; the flag selects the artifact name pattern below.
      flag: [unit-test, e2e]
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Download coverage reports of current branch
      uses: actions/download-artifact@v4
      with:
        pattern: coverage-${{ matrix.flag }}-*
    - name: Get total coverage of current branch
      if: always()
      shell: bash -x -e -u -o pipefail {0}
      run: |
        pip install coverage[toml]
        ls -al .
        ls -al coverage-*/
        coverage combine --keep $(ls coverage-*/.coverage)
        coverage report -i
        rm -rf coverage-*
        ls -al
    - name: Upload coverage reports to Codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        verbose: true
        flags: ${{ matrix.flag }}
    - name: Upload artifacts
      uses: actions/upload-artifact@v4
      with:
        name: coverage-${{ matrix.flag }}-aggregated
        path: |
          .coverage
        # `.coverage` is a dotfile, so hidden files must be included.
        include-hidden-files: true