ROCm
diff --git a/‎build_tools/pytorch.py‎
Lines changed: 8 additions & 12 deletions b/‎build_tools/pytorch.py‎
Lines changed: 8 additions & 12 deletions
diff --git a/‎ci/pytorch.sh‎
Lines changed: 0 additions & 1 deletion b/‎ci/pytorch.sh‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎tests/pytorch/distributed/run_layer_with_overlap.py‎
Lines changed: 10 additions & 6 deletions b/‎tests/pytorch/distributed/run_layer_with_overlap.py‎
Lines changed: 10 additions & 6 deletions
diff --git a/‎tests/pytorch/distributed/test_comm_gemm_overlap.py‎
Lines changed: 6 additions & 4 deletions b/‎tests/pytorch/distributed/test_comm_gemm_overlap.py‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎transformer_engine/common/comm_gemm_overlap/comm_gemm_overlap.cpp‎
Lines changed: 6 additions & 4 deletions b/‎transformer_engine/common/comm_gemm_overlap/comm_gemm_overlap.cpp‎
Lines changed: 6 additions & 4 deletions
@@ -85,6 +85,14 @@ def setup_pytorch_extension(
             if version < (12, 0):
                 raise RuntimeError("Transformer Engine requires CUDA 12.0 or newer")
 
+    if bool(int(os.getenv("NVTE_UB_WITH_MPI", "0"))):
+        assert (
+            os.getenv("MPI_HOME") is not None
+        ), "MPI_HOME=/path/to/mpi must be set when compiling with NVTE_UB_WITH_MPI=1!"
+        mpi_path = Path(os.getenv("MPI_HOME"))
+        include_dirs.append(mpi_path / "include")
+        cxx_flags.append("-DNVTE_UB_WITH_MPI")
+
     library_dirs = []
     libraries = []
     if bool(int(os.getenv("NVTE_ENABLE_NVSHMEM", 0))):
@@ -104,17 +112,6 @@ def setup_pytorch_extension(
         libraries.append("mpi")
         cxx_flags.extend(["-DNVTE_ENABLE_ROCSHMEM", "-DOMPI_SKIP_MPICXX"])
 
-    extra_link_args = []
-    if bool(int(os.getenv("NVTE_UB_WITH_MPI", "0"))):
-        assert (
-            os.getenv("MPI_HOME") is not None
-        ), "MPI_HOME=/path/to/mpi must be set when compiling with NVTE_UB_WITH_MPI=1!"
-        mpi_path = Path(os.getenv("MPI_HOME", "/usr/lib/x86_64-linux-gnu/openmpi"))
-        include_dirs.append(mpi_path / "include")
-        library_dirs.append(mpi_path / "lib")
-        libraries.append("mpi")
-        cxx_flags.extend(["-DNVTE_UB_WITH_MPI", "-DOMPI_SKIP_MPICXX"])
-
     # Construct PyTorch CUDA extension
     sources = [str(path) for path in sources]
     include_dirs = [str(path) for path in include_dirs]
@@ -127,5 +124,4 @@ def setup_pytorch_extension(
         extra_compile_args={"cxx": cxx_flags},
         libraries=[str(lib) for lib in libraries],
         library_dirs=[str(lib_dir) for lib_dir in library_dirs],
-        extra_link_args=[str(arg) for arg in extra_link_args],
     )
@@ -92,7 +92,6 @@ run_test_config_mgpu(){
     #run in parallel on CI and it affects timing
     run_default_fa 1 test_gemm_sm_count.py
     run_default_fa 3 test_sanity_import.py
-    run_default_fa 3 distributed/test_fusible_ops_with_userbuffers.py
     run_default_fa 3 distributed/test_comm_gemm_overlap.py
     run_default_fa 2 distributed/test_fusible_ops.py
     run_default_fa 2 distributed/test_numerics.py
 
@@ -28,14 +28,17 @@
     MXFP8BlockScaling,
 )
 
+from torch.utils.cpp_extension import IS_HIP_EXTENSION
+
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 warnings.filterwarnings("ignore", category=FutureWarning)
 warnings.filterwarnings("ignore", category=UserWarning)
 
-import transformer_engine.pytorch.cpp_extensions as tex
-os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
-if not tex.device_supports_multicast():
-    os.environ["UB_SKIPMC"] = "1"
+if IS_HIP_EXTENSION:
+    import transformer_engine.pytorch.cpp_extensions as tex
+    os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+    if not tex.device_supports_multicast():
+        os.environ["UB_SKIPMC"] = "1"
 
 
 class multi_module_model(torch.nn.Module):
@@ -118,6 +121,7 @@ def _get_layer_args(config, tp_group, tp_size, num_layers, reference=False):
                 kwargs["input_layernorm"] = True
             else:
                 kwargs["ub_tp_comm_overlap"] = not reference
+                # Disable forward pass overlaps on HIP to isolate backward RS overlap
                 kwargs["hidden_dropout"] = 0.0
         kwargs["set_parallel_mode"] = True
         kwargs["ub_overlap_rs_dgrad"] = config.overlap_rs_dgrad and not reference
@@ -557,8 +561,8 @@ def run_fwd_bwd(model, x):
         # Now validate accuracy
         if not bool(numerics_failed.item()):
             for i, (test_g, ref_g) in enumerate(zip(test_grads, ref_grads)):
-                rtol = 0.125 if opts.fp8 else 0.025
-                atol = 0.0625 if opts.fp8 else 0.00125
+                rtol = 0.125 if opts.fp8 else 0.025 if not IS_HIP_EXTENSION else 5e-2
+                atol = 0.0625 if opts.fp8 else 0.00125 if not IS_HIP_EXTENSION else 1e-2
                 grad_failed, grad_info = _compare_tensors(names[i], test_g, ref_g, rtol, atol)
                 dist_print(grad_info, src=WORLD_RANK, error=grad_failed)
                 numerics_failed[0] = int(grad_failed)
 
@@ -73,7 +73,7 @@ def _run_gemm_with_overlap(comm_type, bulk, p2p, atomic, aggregate, quantization
         test_cmd.append("--bulk-overlap")
     else:
         if IS_HIP_EXTENSION and not p2p:
-            pytest.skip("HIP only supports A2A operations.")
+            pytest.skip("HIP only supports P2P operations.")
         if quantization == "fp8" and not fp8_available:
             pytest.skip(reason_for_no_fp8)
         if quantization == "mxfp8" and not mxfp8_available:
@@ -100,6 +100,9 @@ def _run_gemm_with_overlap(comm_type, bulk, p2p, atomic, aggregate, quantization
 def _run_layer_with_overlap(
     layer_type, linear_parallel_mode, overlap_rs_dgrad, fp8, quantization, num_layers=1
 ):
+    # Skip BULK overlap tests on HIP (column parallel or None with overlap_rs_dgrad=False)
+    if IS_HIP_EXTENSION and not overlap_rs_dgrad and linear_parallel_mode in ("column", None):
+        pytest.skip("Bulk overlap is not yet supported on HIP/ROCm.")
     test_path = TEST_ROOT / "run_layer_with_overlap.py"
     test_cmd = LAUNCH_CMD + [
         str(test_path),
@@ -163,6 +166,7 @@ def test_split_reduce_scatter_overlaps(quantization, p2p):
     _run_gemm_with_overlap("RS", False, p2p, False, False, quantization)
 
 
+@pytest.mark.skipif(IS_HIP_EXTENSION, reason="Bulk overlap is not yet supported on ROCm.")
 @pytest.mark.parametrize(
     "comm_type, quantization, connections",
     [
@@ -192,8 +196,6 @@ def test_bulk_overlaps(comm_type, quantization, connections):
                 "CUDA_DEVICE_MAX_CONNECTIONS=8 test only applies to devices with compute capability"
                 " 9.0 (HOPPER ARCH)."
             )
-        if IS_HIP_EXTENSION:
-            pytest.skip("HIP Does not support bulk overlaps with 8 connections.")
         os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "8"
         _run_gemm_with_overlap(comm_type, True, False, False, False, quantization)
         os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
@@ -228,7 +230,7 @@ def test_bulk_overlaps(comm_type, quantization, connections):
     ids=[
         f" {te.Linear.__name__} - ROW-PARALLEL ",
         f" {te.Linear.__name__} - COL-PARALLEL - BULK DGRAD/WGRAD ",
-        f" {te.Linear.__name__} - COL-PARLALEL - DGRAD+RS ",
+        f" {te.Linear.__name__} - COL-PARALLEL - DGRAD+RS ",
         f" {te.LayerNormLinear.__name__} - ROW-PARALLEL ",
         f" {te.LayerNormLinear.__name__} - COL-PARALLEL - BULK DGRAD/WGRAD ",
         f" {te.LayerNormLinear.__name__} - COL-PARALLEL - DGRAD+RS ",
 
@@ -206,6 +206,7 @@ TensorWrapper CommOverlapCore::get_tensor_chunk(const TensorWrapper &source, siz
   NVTE_DIM_CHECK(chunk_height > 0 && chunk_width > 0, "Attempted to get empty tensor chunk");
   NVTE_DIM_CHECK(chunk_height <= height && chunk_width <= width,
                  "Attempted to get out-of-bounds tensor chunk");
+#ifndef __HIP_PLATFORM_AMD__
   if (scaling_mode == NVTEScalingMode::NVTE_MXFP8_1D_SCALING) {
     // MXFP8 scale-inverses are padded to a 2D matrix with dims that
     // are divisible by 128. UB doesn't handle this padding yet.
@@ -214,6 +215,7 @@ TensorWrapper CommOverlapCore::get_tensor_chunk(const TensorWrapper &source, siz
     NVTE_DIM_CHECK(chunk_height % 128 == 0 && chunk_width % 128 == 0,
                    "Userbuffers requires MXFP8 tensor chunk dims that are divisible by 128");
   }
+#endif
 #undef NVTE_DIM_CHECK
 
   // Construct tensor chunk
@@ -726,12 +728,12 @@ void CommOverlapP2PBase::initialize(const std::vector<size_t> &buffer_shape, DTy
     NVTE_CHECK_CUDA(cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, _comm_priority));
     _stream_send.push_back(std::move(stream));
   }
-  for (int i = 0; i < 7; i++) {
+  for (int i = 0; i < NVTE_ROCM_MAX_RINGS; i++) {
     cudaStream_t stream;
     NVTE_CHECK_CUDA(cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, _comm_priority));
     l_stream_send.push_back(std::move(stream));
   }
-  for (int i = 0; i < 7; i++) {
+  for (int i = 0; i < NVTE_ROCM_MAX_RINGS; i++) {
     cudaStream_t stream;
     NVTE_CHECK_CUDA(cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, _comm_priority));
     l_stream_recv.push_back(std::move(stream));
@@ -740,7 +742,7 @@ void CommOverlapP2PBase::initialize(const std::vector<size_t> &buffer_shape, DTy
       cudaStreamCreateWithPriority(&_stream_recv, cudaStreamNonBlocking, _comm_priority));
   NVTE_CHECK_CUDA(cudaEventCreateWithFlags(&_stop_send, 0));
   NVTE_CHECK_CUDA(cudaEventCreateWithFlags(&_stop_recv, 0));
-  for (int i = 0; i < 7; i++) {
+  for (int i = 0; i < NVTE_ROCM_MAX_RINGS; i++) {
     NVTE_CHECK_CUDA(cudaEventCreateWithFlags(&l_stop_recv[i], 0));
   }
 }
@@ -752,7 +754,7 @@ CommOverlapP2PBase::~CommOverlapP2PBase() {
   for (size_t i = 0; i < _stream_send.size(); i++) {
     cudaStreamDestroy(_stream_send[i]);
   }
-  for (int i = 0; i < 7; i++) {
+  for (int i = 0; i < NVTE_ROCM_MAX_RINGS; i++) {
     cudaStreamDestroy(l_stream_recv[i]);
     cudaStreamDestroy(l_stream_send[i]);
     cudaEventDestroy(l_stop_recv[i]);