Skip to content

Commit 5d3b620

Browse files
committed
Add torch.cond split-K decode dispatch to Qwen3.5 MoE attention

Runtime dispatch via torch.cond in FullAttention: split-K flash-decoding for decode (L_q == 1) and standard tiled SDPA for prefill (L_q > 1). Guard sdpa_decode_splitk validation behind isinstance(L_q, int) so AOTI tracing with symbolic shapes doesn't trip the L_q == 1 check. Align the sdpa_decode_splitk signature with sdpa (dropout_p, is_causal, enable_gqa) for drop-in use with torch.cond; unsupported arguments fail with clear error messages.

End-to-end on H100 (Qwen3.5-35B-A3B, HQQ-INT4, max_seq_len=4096, 1024 decode tokens, prompt="Hi", temperature=0, median of 5 runs):

| Metric        | Baseline (tiled) | Split-K       | Speedup |
|---------------|------------------|---------------|---------|
| Decode tok/s  | 61.7             | 89.9          | 1.46x   |
| Prefill tok/s | 378.2            | 378.2         | 1.00x   |
| nsys GPU time | 13853 ms         | 8674 ms       | 1.60x   |
| SDPA kernel   | 5370 ms (38.8%)  | 209 ms (2.4%) | 25.7x   |
1 parent e06db27 commit 5d3b620

2 files changed

Lines changed: 46 additions & 13 deletions

File tree

backends/cuda/triton/kernels/sdpa.py

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1390,26 +1390,48 @@ def sdpa_decode_splitk(
13901390
key: torch.Tensor,
13911391
value: torch.Tensor,
13921392
attn_mask: Optional[torch.Tensor] = None,
1393+
dropout_p: float = 0.0,
1394+
is_causal: bool = False,
13931395
scale: float = 0.0,
1396+
enable_gqa: bool = False,
13941397
) -> torch.Tensor:
1398+
"""Split-K flash-decoding SDPA for L_q=1 (decode step).
1399+
1400+
Signature mirrors sdpa() for drop-in use with torch.cond dispatch.
1401+
enable_gqa is accepted but ignored — GQA is handled natively via
1402+
H_q // H_kv grouping; no packed-GQA tradeoff exists at L_q=1.
1403+
"""
1404+
_validate_sdpa_inputs(query, key, value, dropout_p, enable_gqa)
1405+
13951406
B, H_q, L_q, D = query.shape
13961407
_, H_kv, L_kv, _ = key.shape
13971408

1398-
if L_q != 1:
1399-
raise RuntimeError(
1400-
f"sdpa_decode_splitk requires L_q == 1 (decode); got L_q={L_q}"
1401-
)
1402-
if H_q % H_kv != 0:
1403-
raise RuntimeError(
1404-
f"H_q must be divisible by H_kv; got H_q={H_q}, H_kv={H_kv}"
1405-
)
1406-
if not _is_power_of_2(D):
1409+
out = torch.empty((B, H_q, L_q, D), device=query.device, dtype=query.dtype)
1410+
1411+
if is_causal:
14071412
raise RuntimeError(
1408-
f"sdpa_decode_splitk requires power-of-2 head dim; got D={D}"
1413+
"sdpa_decode_splitk does not support is_causal=True "
1414+
"(causal masking is a no-op at L_q=1; pass attn_mask instead)"
14091415
)
14101416

1417+
# Validation — only check at runtime (concrete shapes), not during AOTI
1418+
# tracing where shapes are symbolic. torch.cond traces both branches with
1419+
# the same symbolic L_q, so L_q is not necessarily 1 during tracing.
1420+
if isinstance(L_q, int):
1421+
if L_q != 1:
1422+
raise RuntimeError(
1423+
f"sdpa_decode_splitk requires L_q == 1 (decode); got L_q={L_q}"
1424+
)
1425+
if H_q % H_kv != 0:
1426+
raise RuntimeError(
1427+
f"H_q must be divisible by H_kv; got H_q={H_q}, H_kv={H_kv}"
1428+
)
1429+
if not _is_power_of_2(D):
1430+
raise RuntimeError(
1431+
f"sdpa_decode_splitk requires power-of-2 head dim; got D={D}"
1432+
)
1433+
14111434
num_groups = H_q // H_kv
1412-
out = torch.empty((B, H_q, L_q, D), device=query.device, dtype=query.dtype)
14131435
sm_scale = 1.0 / math.sqrt(D) if scale == 0.0 else scale
14141436
HAS_MASK, Mask_ptr, stride_mb, stride_mq, stride_mk = _prepare_mask_params(
14151437
attn_mask, B, L_q, L_kv
@@ -1430,7 +1452,10 @@ def _sdpa_decode_splitk_abstract(
14301452
key: torch.Tensor,
14311453
value: torch.Tensor,
14321454
attn_mask: Optional[torch.Tensor] = None,
1455+
dropout_p: float = 0.0,
1456+
is_causal: bool = False,
14331457
scale: float = 0.0,
1458+
enable_gqa: bool = False,
14341459
) -> torch.Tensor:
14351460
assert query.dtype == key.dtype == value.dtype, "Q, K, V must have the same dtype"
14361461
B, H_q, L_q, D = query.shape

examples/models/qwen3_5_moe/model.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
import torch.nn as nn
2323
from torch.nn import functional as F
2424

25+
from executorch.backends.cuda.triton.kernels.sdpa import sdpa, sdpa_decode_splitk
26+
2527

2628
# ---------------------------------------------------------------------------
2729
# Config
@@ -285,8 +287,14 @@ def forward(self, x, input_pos):
285287
)
286288
else:
287289
k, v = self.kv_cache.update(input_pos, k, v)
288-
y = F.scaled_dot_product_attention(
289-
q, k, v, attn_mask=attn_mask, enable_gqa=True
290+
# Runtime dispatch via torch.cond:
291+
# decode (L_q==1): split-K flash-decoding for high KV occupancy
292+
# prefill (L_q>1): standard tiled SDPA
293+
y = torch.cond(
294+
q.shape[2] == 1,
295+
lambda q, k, v, mask: sdpa_decode_splitk(q, k, v, attn_mask=mask),
296+
lambda q, k, v, mask: sdpa(q, k, v, attn_mask=mask, enable_gqa=True),
297+
[q, k, v, attn_mask],
290298
)
291299

292300
y = y.transpose(1, 2).contiguous().view(B, T, -1)

0 commit comments

Comments
 (0)