NVIDIA-NeMo · ko3n1g · Dec 9, 2025 · Dec 2, 2025 · Dec 4, 2025 · Dec 5, 2025
diff --git a/src/megatron/bridge/training/utils/flop_utils.py b/src/megatron/bridge/training/utils/flop_utils.py
@@ -321,31 +321,35 @@ def transformer_flops():
 
     # Main entrypoint for FLOPs calculation.
     if getattr(cfg.model, "is_hybrid_model", False):
-        # TODO: Fix this when onboarding hybrid models
         # Calculate the number of each type of layer.
-        # num_attn_layers, num_mamba_layers, num_mlp_layers = calculate_layer_counts()
+        num_attn_layers, num_mamba_layers, num_mlp_layers = calculate_layer_counts()
+        padded_vocab_size = calculate_padded_vocab_size(
+            cfg.model.vocab_size,
+            cfg.model.make_vocab_size_divisible_by,
+            cfg.model.tensor_model_parallel_size,
+            logging_enabled=False,
+        )
 
-        # # Compute hybrid model FLOPs.
-        # return hybrid_flops(
-        #     batch_size=batch_size,
-        #     seq_len=cfg.model.seq_length,
-        #     hidden_size=cfg.model.hidden_size,
-        #     num_attn_layers=num_attn_layers,
-        #     num_mamba_layers=num_mamba_layers,
-        #     num_mlp_layers=num_mlp_layers,
-        #     mamba_state_dim=getattr(cfg.model, 'mamba_state_dim', 128),
-        #     mamba_head_dim=getattr(cfg.model, 'mamba_head_dim', 64),
-        #     mamba_num_groups=getattr(cfg.model, 'mamba_num_groups', 8),
-        #     mamba_num_heads=getattr(cfg.model, 'mamba_num_heads', 128),
-        #     num_attn_heads=cfg.model.num_attention_heads,
-        #     gqa=getattr(cfg.model, 'group_query_attention', False),
-        #     gqa_groups=getattr(cfg.model, 'num_query_groups', 8),
-        #     kv_channels=getattr(cfg.model, 'kv_channels', None),
-        #     mlp_expansion=cfg.model.ffn_hidden_size / cfg.model.hidden_size,
-        #     swiglu=getattr(cfg.model, 'gated_linear_unit', False),
-        #     vocab_size=cfg.tokenizer.padded_vocab_size,
-        # )
-        return 0
+        # Compute hybrid model FLOPs.
+        return hybrid_flops(
+            batch_size=batch_size,
+            seq_len=cfg.model.seq_length,
+            hidden_size=cfg.model.hidden_size,
+            num_attn_layers=num_attn_layers,
+            num_mamba_layers=num_mamba_layers,
+            num_mlp_layers=num_mlp_layers,
+            mamba_state_dim=getattr(cfg.model, "mamba_state_dim", 128),
+            mamba_head_dim=getattr(cfg.model, "mamba_head_dim", 64),
+            mamba_num_groups=getattr(cfg.model, "mamba_num_groups", 8),
+            mamba_num_heads=getattr(cfg.model, "mamba_num_heads", 128),
+            num_attn_heads=cfg.model.num_attention_heads,
+            gqa=getattr(cfg.model, "group_query_attention", False),
+            gqa_groups=getattr(cfg.model, "num_query_groups", 8),
+            kv_channels=getattr(cfg.model, "kv_channels", None),
+            mlp_expansion=cfg.model.ffn_hidden_size / cfg.model.hidden_size,
+            swiglu=getattr(cfg.model, "gated_linear_unit", False),
+            vocab_size=padded_vocab_size,
+        )
     else:
         # Compute standard Transformer model FLOPs.
         return transformer_flops()
diff --git a/tests/functional_tests/L2_Launch_utils.sh b/tests/functional_tests/L2_Launch_utils.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeuo pipefail # Trace commands; abort on errors, unset variables, and failed pipeline stages
+
+export CUDA_VISIBLE_DEVICES="0,1" # Expose only GPUs 0 and 1 to the test run
+
+uv run coverage run --data-file=/opt/Megatron-Bridge/.coverage --source=/opt/Megatron-Bridge/ --parallel-mode -m pytest \
+  -o log_cli=true -o log_cli_level=INFO -v -s -x -m "not pleasefixme" --tb=short -rA \
+  tests/functional_tests/utils
+coverage combine -q
diff --git a/tests/functional_tests/utils/test_flop_utils.py b/tests/functional_tests/utils/test_flop_utils.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for flop_utils module."""
+
+import importlib
+
+import pytest
+
+from megatron.bridge.training.utils.flop_utils import num_floating_point_operations
+from megatron.bridge.utils.vocab_utils import calculate_padded_vocab_size
+
+
+class TestFlops:
+    """Check ``num_floating_point_operations`` against reference values for recipe configs."""
+
+    @pytest.mark.parametrize(
+        "model_family, model_config_func_name, seq_length, vocab_size, expected_flops",
+        [
+            ("llama", "llama3_8b_pretrain_config", 8192, 128256, 4.22e14),
+            ("llama", "llama3_70b_pretrain_config", 8192, 128256, 3.68e15),
+            ("qwen", "qwen3_30b_a3b_pretrain_config", 4096, 151643, 9.42e13),
+            ("qwen", "qwen3_235b_a22b_pretrain_config", 4096, 151643, 6.06e14),
+        ],
+    )
+    def test_flops(self, model_family, model_config_func_name, seq_length, vocab_size, expected_flops):
+        """Compare the FLOPs estimate at global batch size 1 with a 3-significant-digit reference."""
+        model_family_module = importlib.import_module(f"megatron.bridge.recipes.{model_family}")
+        cfg = getattr(model_family_module, model_config_func_name)()
+        cfg.model.finalize()
+
+        cfg.model.seq_length = seq_length
+        cfg.tokenizer.vocab_size = vocab_size
+
+        # Pad the vocab so it is divisible by the tensor-parallel size, then mirror it onto the model config.
+        cfg.tokenizer.padded_vocab_size = calculate_padded_vocab_size(
+            cfg.tokenizer.vocab_size,
+            cfg.model.make_vocab_size_divisible_by,
+            cfg.model.tensor_model_parallel_size,
+        )
+        cfg.model.vocab_size = cfg.tokenizer.padded_vocab_size
+
+        actual_num_flops = num_floating_point_operations(cfg, batch_size=1)
+        # Round to three significant digits so the value can be compared exactly with the reference.
+        actual_num_flops_rounded = float(f"{actual_num_flops:.2e}")
+
+        assert actual_num_flops_rounded == expected_flops, (
+            f"Expected FLOPs: {expected_flops:.2e} but got {actual_num_flops:.2e} "
+            f"with Padded Vocab Size: {cfg.tokenizer.padded_vocab_size} and Sequence len: {cfg.model.seq_length}"
+        )