
Commit eb6bb3c

Fix L3 test failures after merge
1 parent 6eb2707 commit eb6bb3c

8 files changed

Lines changed: 32 additions & 32 deletions


tests/jax/test_custom_call_compute.py

Lines changed: 15 additions & 9 deletions
@@ -1450,22 +1450,24 @@ def ref_func(x, w, data_layout):
         assert_allclose(primitive_w_grad, ref_w_grad, dtype=jnp.bfloat16)

     @pytest.mark.skipif(not is_fp8_supported, reason=fp8_unsupported_reason)
-    @pytest_parametrize_wrapper("m,n,k", TEST_SHAPES)
-    @pytest_parametrize_wrapper("scaling_mode", supported_scaling_modes)
+    @pytest_parametrize_wrapper("m,n,k", [(64, 128, 128), (128, 256, 256)])
+    @pytest_parametrize_wrapper("recipe", supported_recipes)
     @pytest_parametrize_wrapper("with_jax_gemm", [False, True])
     @pytest_parametrize_wrapper("use_bias", [False, True] if is_hip_extension() else [True])
-    def test_dense_grad_fp8(self, m, n, k, scaling_mode, with_jax_gemm, use_bias):
+    def test_dense_grad_fp8_and_fp4(self, m, n, k, recipe, with_jax_gemm, use_bias):
         data_layout = "NN"
         x, w, contracting_dims = self._generate_gemm_input(m, n, k, data_layout)

         key = jax.random.PRNGKey(1)
         bias = jax.random.uniform(key, n, dtype=jnp.bfloat16) if use_bias else None

-        if scaling_mode.is_1d_block_scaling():
+        if recipe.__class__.__name__ == "MXFP8BlockScaling":
             # Check for first GEMM
             _check_mxfp8_gemm_support(with_jax_gemm, m, n, k, use_bias)
             # Check for second GEMM
             _check_mxfp8_gemm_support(with_jax_gemm, m, k, n, use_bias)
+            # Check for third GEMM
+            _check_mxfp8_gemm_support(with_jax_gemm, k, n, m, use_bias)


         def primitive_func(x, w, bias, contracting_dims, quantizer_set):
@@ -1530,19 +1532,21 @@ def _ref_jax_norm_impl(x, gamma, beta, norm_type, zero_centered_gamma, eps, quan

 class TestFusedDense:
     @pytest.mark.skipif(not is_fp8_supported, reason=fp8_unsupported_reason)
-    @pytest.mark.parametrize("m,n,k", [(64, 128, 128)])
+    @pytest.mark.parametrize("m,n,k", [(64, 128, 128), (128, 256, 256)])
     @pytest_parametrize_wrapper("recipe", supported_recipes)
     @pytest.mark.parametrize("norm_type", ["layernorm", "rmsnorm"])
     @pytest_parametrize_wrapper("with_jax_gemm", [False, True])
     def test_layernorm_dense_grad(self, m, n, k, recipe, norm_type, with_jax_gemm):
         """
         Test layernorm_dense VJP Rule
         """
-        if scaling_mode.is_1d_block_scaling():
+        if recipe.__class__.__name__ == "MXFP8BlockScaling":
             # Check for fwd GEMM
             _check_mxfp8_gemm_support(with_jax_gemm, m, n, k)
-            # Check for bwd GEMM
+            # Check for first bwd GEMM
             _check_mxfp8_gemm_support(with_jax_gemm, m, k, n)
+            # Check for second bwd GEMM
+            _check_mxfp8_gemm_support(with_jax_gemm, k, n, m)
         # zero_centered_gamma is already tested in TestNorm
         zero_centered_gamma = False
         eps = 1e-6
@@ -1614,7 +1618,7 @@ def ref_func(x, w, gamma, beta):
         assert_allclose(prim_beta_grad, ref_beta_grad, dtype=quantizer_set.dgrad.q_dtype)

     @pytest.mark.skipif(not is_fp8_supported, reason=fp8_unsupported_reason)
-    @pytest.mark.parametrize("m,n,k", [(64, 128, 128)])
+    @pytest.mark.parametrize("m,n,k", [(64, 128, 128), (128, 256, 256)])
     @pytest.mark.parametrize("activation_type", [("gelu",), ("gelu", "linear")])
     @pytest_parametrize_wrapper("recipe", supported_recipes)
     @pytest.mark.parametrize("norm_type", ["layernorm", "rmsnorm"])
@@ -1626,11 +1630,13 @@ def test_layernorm_mlp_grad(
         """
         Test layernorm_mlp VJP Rule
         """
-        if scaling_mode.is_1d_block_scaling():
+        if recipe.__class__.__name__ == "MXFP8BlockScaling":
             # Check for first GEMM
             _check_mxfp8_gemm_support(with_jax_gemm, m, n, k, use_bias)
             # Check for second GEMM
             _check_mxfp8_gemm_support(with_jax_gemm, m, k, n, use_bias)
+            # Check for third GEMM
+            _check_mxfp8_gemm_support(with_jax_gemm, k, n, m, use_bias)

         # zero_centered_gamma is already tested in TestNorm
         zero_centered_gamma = False
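For context on the new third check: a dense layer's forward and backward pass runs three GEMMs, and the (m, n, k) triples passed to _check_mxfp8_gemm_support appear to name each GEMM's output rows, output columns, and reduction dimension (an assumption based on the call sites, not stated in the diff). A minimal shape-only sketch:

    import jax.numpy as jnp

    m, n, k = 64, 128, 128
    x = jnp.zeros((m, k), dtype=jnp.bfloat16)    # activations
    w = jnp.zeros((k, n), dtype=jnp.bfloat16)    # weights
    dy = jnp.zeros((m, n), dtype=jnp.bfloat16)   # incoming output gradient

    y = x @ w      # forward GEMM: (m, k) x (k, n) -> (m, n), dims (m, n, k)
    dx = dy @ w.T  # dgrad GEMM:   (m, n) x (n, k) -> (m, k), dims (m, k, n)
    dw = x.T @ dy  # wgrad GEMM:   (k, m) x (m, n) -> (k, n), dims (k, n, m)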

tests/jax/test_distributed_layernorm_mlp.py

Lines changed: 3 additions & 3 deletions
@@ -217,11 +217,11 @@ def _test_layernorm_mlp_grad(
         is_hip_extension()
         and (not with_jax_gemm)
         and use_bias
-        and (fp8_recipe is None)
+        and (quantization_recipe is None)
         and (dtype == jnp.bfloat16)
     ):
         pytest.xfail("Skip known failure case.")
-    if isinstance(fp8_recipe, recipe.MXFP8BlockScaling):
+    if isinstance(quantization_recipe, recipe.MXFP8BlockScaling):
         _check_mxfp8_layernorm_mlp_grad_support(
             input_shape[0]*input_shape[1],
             INTERMEDIATE,
@@ -410,7 +410,7 @@ def _test_layernorm_mlp(
     use_shardy,
     with_jax_gemm,
 ):
-    if isinstance(fp8_recipe, recipe.MXFP8BlockScaling):
+    if isinstance(quantization_recipe, recipe.MXFP8BlockScaling):
         _check_mxfp8_layernorm_mlp_support(
             input_shape[0]*input_shape[1],
             INTERMEDIATE,

tests/pytorch/distributed/run_fsdp2_fp8_model.py

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@
 from torch.distributed._composable.fsdp import fully_shard
 from torch.distributed.device_mesh import init_device_mesh
 from transformer_engine.pytorch import torch_version
-from transformer_engine.pytorch.fp8 import fp8_model_init
+from transformer_engine.pytorch.quantization import quantized_model_init
 from torch.nn.parallel import DistributedDataParallel as DDP
 from pathlib import Path

@@ -171,7 +171,7 @@ def _train(args):
         torch.cuda.memory._record_memory_history(enabled='all', context='all', stacks='all')
     if args.fp8_init:
         # Build the model with the specified context
-        with fp8_model_init(enabled = True):
+        with quantized_model_init(enabled=True):
             model = SimpleNet(args.input_size, args.hidden_size, args.output_size, use_fsdp2=args.use_fsdp2)
     else:
         model = SimpleNet(args.input_size, args.hidden_size, args.output_size, use_fsdp2=args.use_fsdp2)

tests/pytorch/distributed/test_torch_fsdp2_fp8.py

Lines changed: 8 additions & 13 deletions
@@ -1,5 +1,5 @@
 #!/usr/bin/python3
-# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2025-2026, Advanced Micro Devices, Inc. All rights reserved.
 # See LICENSE for license information.

 import os
@@ -8,7 +8,7 @@
 import subprocess
 from pathlib import Path
 from transformer_engine.pytorch import torch_version
-from transformer_engine.pytorch.fp8 import FP8GlobalStateManager
+from transformer_engine.pytorch.quantization import FP8GlobalStateManager
 import torch
 from run_fsdp2_fp8_model import SimpleNet

@@ -17,20 +17,15 @@

 NUM_PROCS: int = torch.cuda.device_count()

-def assert_allclose(
-    l1: List[torch.Tensor], l2: List[torch.Tensor], atol: float, rtol: float = None
-) -> bool:
-    """Ensures two lists are equal."""
+def assertEqual(
+    l1: List[torch.Tensor], l2: List[torch.Tensor]) -> bool:
+    """Ensures two lists are exactly equal."""
     assert len(l1) == len(l2), "Unequal number of outputs."
     for i, (t1, t2) in enumerate(zip(l1, l2)):
-        tols = dict(atol=atol)
-        if rtol is not None:
-            tols["rtol"] = rtol
-        result = torch.allclose(t1, t2, **tols)
+        result = torch.allclose(t1, t2, atol=0, rtol=0)
         if not result:
             diff = torch.abs(t1 - t2)
-            tol = atol + (rtol * torch.abs(t2))
-            exceed_mask = diff > tol
+            exceed_mask = diff > 0
             if exceed_mask.any():
                 indices = torch.nonzero(exceed_mask, as_tuple=True)
                 max_diff = diff[exceed_mask].max()
@@ -64,7 +59,7 @@ def _run_test(fp_init, recipe):
     for idx, (te_output_no_cache, te_output_cache) in enumerate(zip(output_fsdp, output_dp)):

         print(f"Comparing FSDP {te_output_no_cache[0]}, DDP {te_output_cache[0]} at index {idx}...")
-        assert_allclose(te_output_no_cache[1], te_output_cache[1], atol=0, rtol=0)
+        assertEqual(te_output_no_cache[1], te_output_cache[1])  # expects exact match
         print(f"Tensor at index {idx} passed comparison.")
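The renamed assertEqual helper relies on torch.allclose with both tolerances set to zero. Because allclose tests |t1 - t2| <= atol + rtol * |t2|, zero tolerances reduce it to element-wise equality, so the FSDP and DDP outputs must match exactly. A standalone illustration (not part of the test itself):

    import torch

    t1 = torch.tensor([1.0, 2.0, 3.0])
    t2 = t1.clone()
    assert torch.allclose(t1, t2, atol=0, rtol=0)  # zero tolerances: exact match required
    assert torch.equal(t1, t2)                     # an equivalent exact-equality check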

transformer_engine/common/fused_attn_rocm/fused_attn.cpp

Lines changed: 1 addition & 1 deletion
@@ -283,7 +283,7 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend(
   using namespace transformer_engine;

   // TODO: Add return_max_logit support
-  if (return_max_logit || cuda_graph) return NVTE_Fused_Attn_Backend::NVTE_No_Backend;
+  if (return_max_logit) return NVTE_Fused_Attn_Backend::NVTE_No_Backend;

   // by default, fused attn is enabled
   bool nvte_fused_attn = true;

transformer_engine/jax/quantize/scaling_modes.py

Lines changed: 2 additions & 2 deletions
@@ -26,7 +26,7 @@
 from transformer_engine_jax import JAXX_Scaling_Mode
 from .misc import QuantizeLayout
 from .device_utils import is_fp8_gemm_with_all_layouts_supported
-from ..util import is_hip_extension
+from ..util import is_hip_extension, get_jnp_float8_e4m3_type, get_jnp_float8_e5m2_type


 __all__ = [
@@ -1038,7 +1038,7 @@ def get_compatible_q_dtypes(self) -> set[jnp.dtype]:
             ScalingMode.CURRENT_TENSOR_SCALING,
             ScalingMode.MXFP8_1D_SCALING,
         ):
-            return {jnp.float8_e5m2, jnp.float8_e4m3fn}
+            return {get_jnp_float8_e5m2_type(), get_jnp_float8_e4m3_type()}
         if self in (ScalingMode.NVFP4_1D_SCALING, ScalingMode.NVFP4_2D_SCALING):
             return {jnp.float4_e2m1fn}
         if self == ScalingMode.NO_SCALING:
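The hard-coded OCP FP8 dtypes are replaced by helpers imported from ..util; their bodies are not part of this commit. A plausible sketch, assuming they simply switch between the FNUZ FP8 encodings used by ROCm MI300-class devices and the OCP encodings used elsewhere (hypothetical implementation, for illustration only):

    # Hypothetical sketch -- the real helpers live in transformer_engine/jax/util.py
    # and are not shown in this diff.
    import jax.numpy as jnp
    from transformer_engine.jax.util import is_hip_extension

    def get_jnp_float8_e4m3_type():
        # ROCm MI300-class GPUs use the FNUZ FP8 formats; CUDA uses the OCP ones.
        return jnp.float8_e4m3fnuz if is_hip_extension() else jnp.float8_e4m3fn

    def get_jnp_float8_e5m2_type():
        return jnp.float8_e5m2fnuz if is_hip_extension() else jnp.float8_e5m2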

transformer_engine/pytorch/cpp_extensions/fused_attn.py

Lines changed: 0 additions & 1 deletion
@@ -266,7 +266,6 @@ def fused_attn_fwd(

     if IS_HIP_EXTENSION:
         assert not return_max_logit, "ROCm does not support return_max_logit yet."
-        assert not cuda_graph, "ROCm does not support cuda_graph."

     if attn_scale is None:
         d = q.size(-1)

transformer_engine/pytorch/quantization.py

Lines changed: 1 addition & 1 deletion
@@ -623,7 +623,7 @@ def autocast_exit(cls, enabled: bool, _graph: bool) -> None:
         # Reduce only the non-FP8 weight modules here.
         # FP8 weight modules are reduced at the end of the optimizer
         # step after the weight amax is populated.
-        if enabled and cls.AUTOCAST_DEPTH == 0 and not _graph and torch.is_grad_enabled():
+        if not cls.SKIP_FP8_REDUCTION_FOR_FSDP2 and enabled and cls.AUTOCAST_DEPTH == 0 and not _graph and torch.is_grad_enabled():
             # delayed scaling only function, for other recipes (current scaling with any granularity),
             # this is noop for other recipes because cls.global_amax_buffer is empty list
             cls.reduce_and_update_fp8_tensors(forward=True)
