diff --git a/example/ck_tile/50_sparse_attn/CMakeLists.txt b/example/ck_tile/50_sparse_attn/CMakeLists.txt index 65bb2077642..b20a661805f 100644 --- a/example/ck_tile/50_sparse_attn/CMakeLists.txt +++ b/example/ck_tile/50_sparse_attn/CMakeLists.txt @@ -1,8 +1,8 @@ -# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -# SPDX-License-Identifier: MIT -# CMakeLists.txt for sparse attention (Jenga and VSA) +#Copyright(c) Advanced Micro Devices, Inc., or its affiliates. +#SPDX - License - Identifier : MIT +#CMakeLists.txt for sparse attention(Jenga and VSA) -# Use SUPPORTED_GPU_TARGETS directly +#Use SUPPORTED_GPU_TARGETS directly set(INST_TARGETS ${SUPPORTED_GPU_TARGETS}) set(GPU_TARGETS ${SUPPORTED_GPU_TARGETS}) @@ -16,7 +16,7 @@ endif() message(STATUS "Building Sparse Attention (Jenga & VSA) for targets: ${INST_TARGETS}") -# Code generation scripts +#Code generation scripts file(GLOB_RECURSE CODE_GEN_SCRIPTS CONFIGURE_DEPENDS ${CMAKE_CURRENT_LIST_DIR}/generate.py ${CMAKE_CURRENT_LIST_DIR}/codegen/*.py @@ -153,4 +153,47 @@ target_compile_options(${EXAMPLE_VSA_SPARSE_ATTN} PRIVATE -Wno-float-equal ) +# ============================================================================ +# Sparge BlockMap GPU Kernel (hand-written instantiation, no codegen) +# ============================================================================ +set(SPARGE_BLOCKMAP_INSTANCES "tile_sparge_blockmap_instances") + +add_library(${SPARGE_BLOCKMAP_INSTANCES} OBJECT EXCLUDE_FROM_ALL + ${CMAKE_CURRENT_LIST_DIR}/sparge_blockmap_inst.cpp +) +target_include_directories(${SPARGE_BLOCKMAP_INSTANCES} PRIVATE + ${CMAKE_CURRENT_LIST_DIR} + ${PROJECT_SOURCE_DIR}/include/ck_tile/ops/sparse_attn +) +set_source_files_properties( + ${CMAKE_CURRENT_LIST_DIR}/sparge_blockmap_inst.cpp + PROPERTIES LANGUAGE HIP +) +set_property(TARGET ${SPARGE_BLOCKMAP_INSTANCES} PROPERTY HIP_ARCHITECTURES ${INST_TARGETS}) + +target_compile_options(${SPARGE_BLOCKMAP_INSTANCES} PRIVATE + 
-DCK_TILE_USE_BUFFER_ADDRESSING_BUILTIN + -DCK_TILE_FMHA_FWD_FAST_EXP2 + -Wno-undefined-func-template + -Wno-float-equal +) + +# ---------------------------------------------------------------------------- +# Build unified Sparge test: combines blockmap, Jenga, and VSA attention +# for end-to-end evaluation and timing in a single executable. +# ---------------------------------------------------------------------------- +set(EXAMPLE_SPARGE "tile_example_sparge") +message(DEBUG "adding example ${EXAMPLE_SPARGE}") +add_executable(${EXAMPLE_SPARGE} EXCLUDE_FROM_ALL test_sparge.cpp) +target_link_libraries(${EXAMPLE_SPARGE} + ${SPARSE_ATTN_JENGA_INSTANCES} + ${SPARSE_ATTN_VSA_INSTANCES} + ${SPARGE_BLOCKMAP_INSTANCES} +) +target_include_directories(${EXAMPLE_SPARGE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_compile_options(${EXAMPLE_SPARGE} PRIVATE + -Wno-undefined-func-template + -Wno-float-equal +) + set_property(GLOBAL PROPERTY RULE_MESSAGES OFF) diff --git a/example/ck_tile/50_sparse_attn/README.md b/example/ck_tile/50_sparse_attn/README.md new file mode 100644 index 00000000000..c7191c8e828 --- /dev/null +++ b/example/ck_tile/50_sparse_attn/README.md @@ -0,0 +1,45 @@ +# Sparge Attention (Composable Kernel) + +A Composable Kernel port of [SpargeAttn](https://github.com/thu-ml/SpargeAttn) for AMD GPU. Both the block-map pipeline (mean-pool → cosine sim → pooled QK → top-k LUT) and the sparse FMHA stage run on-GPU. Two attention backends are exposed via `-pipeline=vsa` (default, faster) and `-pipeline=jenga` (async K/V load variant). 
+ +## Status vs Upstream + +Implemented: +- per-block mean-pool, cosine similarity, pooled QK +- top-k / `cdfthreshd` block selection, BlockMap LUT +- sparse FMHA (both `vsa` and `jenga` backends) +- per-head `topk` / `simthreshd1` / `cdfthreshd` + +Not yet ported (upstream pinned to commit [`ae5b629`](https://github.com/thu-ml/SpargeAttn/tree/ae5b629ebb41e41f86b3ea2ab5a3283f13ac151a)): +- **K smoothing** — pre-pool `k -= km`; required for diffusion / video checkpoints (CogVideoX, Mochi-1, Flux, OpenSora, SD 3.5) ([spas_sage_attn/core.py:L53](https://github.com/thu-ml/SpargeAttn/blob/ae5b629ebb41e41f86b3ea2ab5a3283f13ac151a/spas_sage_attn/core.py#L53)) +- **is_causal mask in pooled score** — required for causal-LM prefill (Llama, Qwen) ([spas_sage_attn/utils.py:L338](https://github.com/thu-ml/SpargeAttn/blob/ae5b629ebb41e41f86b3ea2ab5a3283f13ac151a/spas_sage_attn/utils.py#L338)) +- **attention_sink** — column 0 forced ON; upstream is hard-wired to `True` at inference ([spas_sage_attn/autotune.py:L355](https://github.com/thu-ml/SpargeAttn/blob/ae5b629ebb41e41f86b3ea2ab5a3283f13ac151a/spas_sage_attn/autotune.py#L355)) +- **pv_threshold per-Q-tile skip in attn kernel** — pure perf, ~5–15% on the dominant attention slice ([spas_sage_attn/core.py:L265](https://github.com/thu-ml/SpargeAttn/blob/ae5b629ebb41e41f86b3ea2ab5a3283f13ac151a/spas_sage_attn/core.py#L265)) +- **Sort-based top-k selection** — replaces our O(N_k^2) iterative argmax; matters at long seqlen (s ≥ 16k) ([spas_sage_attn/utils.py:L345](https://github.com/thu-ml/SpargeAttn/blob/ae5b629ebb41e41f86b3ea2ab5a3283f13ac151a/spas_sage_attn/utils.py#L345)) +- **Q/K int8 quant fusion in pool kernel** — enables a downstream int8 GEMM0 in the attn kernel ([spas_sage_attn/utils.py:L371](https://github.com/thu-ml/SpargeAttn/blob/ae5b629ebb41e41f86b3ea2ab5a3283f13ac151a/spas_sage_attn/utils.py#L371)) + +## Performance + +At b=2 h=32 s=16384 fp16, sparge (vsa backend) reaches **1.78× FMHA throughput at topk=0.4** and 
**5.04× at topk=0.1**, and stays above 1.0× across the full topk range. + +![Speedup vs sparsity](docs/speedup_vs_sparsity.png) + +*Speedup vs FMHA, b=2 h=32 s=16384 d=128 fp16. Shape chosen to match Fig. 10 of the SpargeAttn paper ([arXiv:2502.18137](https://arxiv.org/abs/2502.18137); Mochi-1, 22K context, head_dim=128); s=16384 is the closest grid point. Gray-outlined points have >30% inter-rep spread.* + +![Kernel breakdown](docs/kernel_breakdown.png) + +*BlockMap (`_pre`) stacked on attention (`_attn`), b=2 h=32 d=128 fp16 topk=0.4. BlockMap is roughly 17% of total at s=16384.* + +## Usage + +```bash +ninja tile_example_sparge +./bin/tile_example_sparge -pipeline=vsa -b=2 -h=32 -s=16384 -d=128 -topk=0.4 -simthreshd1=0.001 +``` + +Add `-v=1` for CPU validation; use a small shape (`-b=1 -h=2 -s=512`), since full-shape CPU reference scales O(s²) and runs 30+ minutes at s=8k, hours at s=16k. + +## References + +- [SpargeAttn upstream](https://github.com/thu-ml/SpargeAttn) (pinned to [`ae5b629`](https://github.com/thu-ml/SpargeAttn/tree/ae5b629ebb41e41f86b3ea2ab5a3283f13ac151a)) +- [Paper — Zhang et al., arXiv:2502.18137](https://arxiv.org/abs/2502.18137) diff --git a/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_jenga.py b/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_jenga.py index a3d32652a98..fc4b8642ddd 100644 --- a/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_jenga.py +++ b/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_jenga.py @@ -141,6 +141,17 @@ def update_file(file_path, content): constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); }} + +template<> +void fmha_jenga_fwd_oneshot_(const ck_tile::stream_config& s, fmha_jenga_fwd_args a) +{{ + using k_ = fmha_kernel_{F_idx}; + auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); + const dim3 blocks = k_::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; + 
ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)( + ck_tile::stream_config{{s.stream_id_}}); +}} """ FMHA_FWD_API_FILENAME = "fmha_jenga_fwd_api.cpp" @@ -219,6 +230,45 @@ def update_file(file_path, content): }} """ +FMHA_FWD_ONESHOT_API_FILENAME = "fmha_jenga_fwd_oneshot_api.cpp" +FMHA_FWD_ONESHOT_API = """ +#include "fmha_fwd_trek.hpp" +#include + +void fmha_jenga_fwd_oneshot(fmha_jenga_fwd_traits t, fmha_jenga_fwd_args a, const ck_tile::stream_config& s){{ + + const bool has_load_tr = ck_tile::is_load_tr_supported(); + +{F_dispatch} + std::cerr << "fmha_jenga_fwd_oneshot: no matching dispatch (dtype=" << t.data_type + << " hdim_q=" << t.hdim_q << " hdim_v=" << t.hdim_v + << " seqlen_q=" << a.seqlen_q << " seqlen_k=" << a.seqlen_k + << " mask=" << static_cast(t.mask_type) << ")" << std::endl; +}} +""" + +FMHA_FWD_ONESHOT_API_PER_TRLOAD = """ {F_if}({F_trload_cond}){{ +{F_dtype_case} + }} +""" + +FMHA_FWD_ONESHOT_API_PER_DTYPE = """ {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{ +{F_hdim_case} + }} +""" +FMHA_FWD_ONESHOT_API_PER_HDIM_CASE = """ {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hdim_v}) {{ +{F_inner_dispatch} + }} +""" + +FMHA_FWD_ONESHOT_API_INNER_DISPATCH = """ {F_if}((t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && + ({F_scheck}) && ({F_seqtune}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && ({F_constraint})) {{ + using trait_ = fmha_jenga_fwd_traits_<{F_hdim}, {F_dtype}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, false/*logits*/, {F_mask}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}>; + fmha_jenga_fwd_oneshot_(s, a); + return; + }} +""" + @dataclass class CppConstraint: @@ -274,10 +324,7 @@ def scheck(self) -> str: @property def seqtune(self) -> str: - if self.bm0 == 128: - return "true/*fall back to largest tile*/" # group mode only generate spad/skpad == true - else: - return f"a.seqlen_q <= {self.bm0}" + return "true" @property def skcheck(self) -> str: @@ -447,6 
+494,67 @@ def api(self) -> str: per_tr_load += " (void)t ; (void)s ; (void)a;" return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch=per_tr_load) + @property + def oneshot_api(self) -> str: + tr_load_cond_map = {"t": "has_load_tr", "f": "true"} + + per_tr_load = str() + for tr_load in ["t", "f"]: + per_dtypes = str() + for i, dtype in enumerate(self.pool.keys()): + per_hdim_case = str() + for j, (hdim, hdim_v) in enumerate(self.pool[dtype].keys()): + traits = [ + t + for t in self.pool[dtype][(hdim, hdim_v)] + if tr_load == t.tr_load + ] + inners = str() + for k, trait in enumerate(traits): + if_k = "if" if k == 0 else "else if" + inners = inners + FMHA_FWD_ONESHOT_API_INNER_DISPATCH.format( + F_if=if_k, + F_vlayout=LAYOUT_MAP[trait.vlayout], + F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], + F_mask=get_mask_map(self.mask_impl)[trait.mask], + F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], + F_trload=BOOL_MAP[trait.tr_load], + F_scheck=trait.scheck, + F_seqtune=trait.seqtune, + F_skcheck=trait.skcheck, + F_dcheck=trait.dcheck, + F_dvcheck=trait.dvcheck, + F_constraint=trait.constraint, + F_spad=BOOL_MAP[trait.spad], + F_skpad=BOOL_MAP[trait.skpad], + F_dpad=BOOL_MAP[trait.dpad], + F_dvpad=BOOL_MAP[trait.dvpad], + F_bm0=trait.bm0, + F_bn0=trait.bn0, + F_bk0=trait.bk0, + F_bn1=trait.bn1, + F_bk1=trait.bk1, + F_bk0max=trait.bk0max, + F_hdim=hdim, + F_dtype=FWD_DTYPE_MAP[dtype], + ) + if_j = "if" if j == 0 else "else if" + per_hdim_case = per_hdim_case + FMHA_FWD_ONESHOT_API_PER_HDIM_CASE.format( + F_if=if_j, F_hdim=hdim, F_hdim_v=hdim_v, F_inner_dispatch=inners + ) + if_i = "if" if i == 0 else "else if" + per_dtypes = per_dtypes + FMHA_FWD_ONESHOT_API_PER_DTYPE.format( + F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case + ) + per_tr_load += FMHA_FWD_ONESHOT_API_PER_TRLOAD.format( + F_if="if", + F_trload_cond=tr_load_cond_map[tr_load], + F_dtype_case=per_dtypes, + ) + if not per_tr_load: + per_tr_load += " (void)t ; (void)s ; (void)a;" 
+ return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_ONESHOT_API.format(F_dispatch=per_tr_load) + @dataclass class FmhaFwdTileSize: @@ -582,38 +690,39 @@ def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: # FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], # (96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], (128, 128): [ - FmhaFwdTileSize( # fmt: skip - 16, + FmhaFwdTileSize( # fmt: skip -- 128x128 tile (original, for old sparse attn test) + 128, + 128, 32, - 64, 128, 32, 128, + 4, 1, 1, + 4, 1, 1, - 1, - 1, - 16, - 16, 32, - 16, + 32, 16, 32, + 32, + 16, -1, + CppConstraint("t.bm0 == 0 || t.bm0 == 128"), ), - FmhaFwdTileSize( # fmt: skip - 32, - 32, + FmhaFwdTileSize( # fmt: skip -- 64x128 tile (for sparge blockmap kM0=64) + 64, 128, + 32, 128, 32, 128, + 2, 1, 1, - 1, - 1, + 2, 1, 1, 32, @@ -623,18 +732,40 @@ def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: 32, 16, -1, + CppConstraint("t.bm0 == 64"), ), FmhaFwdTileSize( # fmt: skip - 128, + 16, + 32, 64, + 128, 32, 128, + 1, + 1, + 1, + 1, + 1, + 1, + 16, + 16, + 32, 16, + 16, + 32, + -1, + ), + FmhaFwdTileSize( # fmt: skip + 32, + 32, + 128, + 128, + 32, 128, - 4, 1, 1, - 4, + 1, + 1, 1, 1, 32, @@ -647,10 +778,10 @@ def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: ), FmhaFwdTileSize( # fmt: skip 128, - 128, + 64, 32, 128, - 32, + 16, 128, 4, 1, @@ -780,7 +911,7 @@ def get_fwd_blobs( for tile, pipeline in itertools.product( tiles, factory.get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl) ): - if tile.F_bm0 != 128 or tile.F_bn0 != 128: + if tile.F_bm0 not in (64, 128) or tile.F_bn0 != 128: continue if pipeline.tag != "qr_async": continue @@ -846,6 +977,7 @@ def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None: def write_fwd_api(api_pool: FmhaFwdApiPool, autogen_dir: Path) -> None: update_file(autogen_dir / FMHA_FWD_API_FILENAME, api_pool.api) + update_file(autogen_dir / 
FMHA_FWD_ONESHOT_API_FILENAME, api_pool.oneshot_api) def write_blobs( @@ -865,3 +997,4 @@ def list_blobs( for kernel in kernels: f.write((file_path.parent / GEN_DIR / kernel.filename).as_posix() + "\n") f.write((file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME).as_posix() + "\n") + f.write((file_path.parent / GEN_DIR / FMHA_FWD_ONESHOT_API_FILENAME).as_posix() + "\n") diff --git a/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_vsa.py b/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_vsa.py index 038738de246..208877037f1 100644 --- a/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_vsa.py +++ b/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_vsa.py @@ -141,6 +141,17 @@ def update_file(file_path, content): constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); }} + +template<> +void fmha_vsa_fwd_oneshot_(const ck_tile::stream_config& s, fmha_vsa_fwd_args a) +{{ + using k_ = fmha_kernel_{F_idx}; + auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); + const dim3 blocks = k_::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; + ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)( + ck_tile::stream_config{{s.stream_id_}}); +}} """ FMHA_FWD_API_FILENAME = "fmha_vsa_fwd_api.cpp" @@ -219,6 +230,45 @@ def update_file(file_path, content): }} """ +FMHA_FWD_ONESHOT_API_FILENAME = "fmha_vsa_fwd_oneshot_api.cpp" +FMHA_FWD_ONESHOT_API = """ +#include "fmha_fwd_trek.hpp" +#include + +void fmha_vsa_fwd_oneshot(fmha_vsa_fwd_traits t, fmha_vsa_fwd_args a, const ck_tile::stream_config& s){{ + + const bool has_load_tr = ck_tile::is_load_tr_supported(); + +{F_dispatch} + std::cerr << "fmha_vsa_fwd_oneshot: no matching dispatch (dtype=" << t.data_type + << " hdim_q=" << t.hdim_q << " hdim_v=" << t.hdim_v + << " seqlen_q=" << a.seqlen_q << " seqlen_k=" << a.seqlen_k + << " mask=" << static_cast(t.mask_type) << ")" << std::endl; +}} +""" + 
+FMHA_FWD_ONESHOT_API_PER_TRLOAD = """ {F_if}({F_trload_cond}){{ +{F_dtype_case} + }} +""" + +FMHA_FWD_ONESHOT_API_PER_DTYPE = """ {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{ +{F_hdim_case} + }} +""" +FMHA_FWD_ONESHOT_API_PER_HDIM_CASE = """ {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hdim_v}) {{ +{F_inner_dispatch} + }} +""" + +FMHA_FWD_ONESHOT_API_INNER_DISPATCH = """ {F_if}((t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && + ({F_scheck}) && ({F_seqtune}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && ({F_constraint})) {{ + using trait_ = fmha_vsa_fwd_traits_<{F_hdim}, {F_dtype}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, false/*logits*/, {F_mask}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}>; + fmha_vsa_fwd_oneshot_(s, a); + return; + }} +""" + @dataclass class CppConstraint: @@ -274,10 +324,7 @@ def scheck(self) -> str: @property def seqtune(self) -> str: - if self.bm0 == 128: - return "true/*fall back to largest tile*/" # group mode only generate spad/skpad == true - else: - return f"a.seqlen_q <= {self.bm0}" + return "true" @property def skcheck(self) -> str: @@ -447,6 +494,67 @@ def api(self) -> str: per_tr_load += " (void)t ; (void)s ; (void)a;" return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch=per_tr_load) + @property + def oneshot_api(self) -> str: + tr_load_cond_map = {"t": "has_load_tr", "f": "true"} + + per_tr_load = str() + for tr_load in ["t", "f"]: + per_dtypes = str() + for i, dtype in enumerate(self.pool.keys()): + per_hdim_case = str() + for j, (hdim, hdim_v) in enumerate(self.pool[dtype].keys()): + traits = [ + t + for t in self.pool[dtype][(hdim, hdim_v)] + if tr_load == t.tr_load + ] + inners = str() + for k, trait in enumerate(traits): + if_k = "if" if k == 0 else "else if" + inners = inners + FMHA_FWD_ONESHOT_API_INNER_DISPATCH.format( + F_if=if_k, + F_vlayout=LAYOUT_MAP[trait.vlayout], + F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], + 
F_mask=get_mask_map(self.mask_impl)[trait.mask], + F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], + F_trload=BOOL_MAP[trait.tr_load], + F_scheck=trait.scheck, + F_seqtune=trait.seqtune, + F_skcheck=trait.skcheck, + F_dcheck=trait.dcheck, + F_dvcheck=trait.dvcheck, + F_constraint=trait.constraint, + F_spad=BOOL_MAP[trait.spad], + F_skpad=BOOL_MAP[trait.skpad], + F_dpad=BOOL_MAP[trait.dpad], + F_dvpad=BOOL_MAP[trait.dvpad], + F_bm0=trait.bm0, + F_bn0=trait.bn0, + F_bk0=trait.bk0, + F_bn1=trait.bn1, + F_bk1=trait.bk1, + F_bk0max=trait.bk0max, + F_hdim=hdim, + F_dtype=FWD_DTYPE_MAP[dtype], + ) + if_j = "if" if j == 0 else "else if" + per_hdim_case = per_hdim_case + FMHA_FWD_ONESHOT_API_PER_HDIM_CASE.format( + F_if=if_j, F_hdim=hdim, F_hdim_v=hdim_v, F_inner_dispatch=inners + ) + if_i = "if" if i == 0 else "else if" + per_dtypes = per_dtypes + FMHA_FWD_ONESHOT_API_PER_DTYPE.format( + F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case + ) + per_tr_load += FMHA_FWD_ONESHOT_API_PER_TRLOAD.format( + F_if="if", + F_trload_cond=tr_load_cond_map[tr_load], + F_dtype_case=per_dtypes, + ) + if not per_tr_load: + per_tr_load += " (void)t ; (void)s ; (void)a;" + return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_ONESHOT_API.format(F_dispatch=per_tr_load) + @dataclass class FmhaFwdTileSize: @@ -582,38 +690,39 @@ def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: # FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], # (96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], (128, 128): [ - FmhaFwdTileSize( # fmt: skip - 16, + FmhaFwdTileSize( # fmt: skip -- 128x128 tile (original, for old sparse attn test) + 128, + 128, 32, - 64, 128, 32, 128, + 4, 1, 1, + 4, 1, 1, - 1, - 1, - 16, - 16, 32, - 16, + 32, 16, 32, + 32, + 16, -1, + CppConstraint("t.bm0 == 0 || t.bm0 == 128"), ), - FmhaFwdTileSize( # fmt: skip - 32, - 32, + FmhaFwdTileSize( # fmt: skip -- 64x128 tile (for sparge blockmap 
kM0=64) + 64, 128, + 32, 128, 32, 128, + 2, 1, 1, - 1, - 1, + 2, 1, 1, 32, @@ -623,18 +732,40 @@ def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: 32, 16, -1, + CppConstraint("t.bm0 == 64"), ), FmhaFwdTileSize( # fmt: skip - 128, + 16, + 32, 64, + 128, 32, 128, + 1, + 1, + 1, + 1, + 1, + 1, + 16, + 16, + 32, 16, + 16, + 32, + -1, + ), + FmhaFwdTileSize( # fmt: skip + 32, + 32, + 128, + 128, + 32, 128, - 4, 1, 1, - 4, + 1, + 1, 1, 1, 32, @@ -647,10 +778,10 @@ def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: ), FmhaFwdTileSize( # fmt: skip 128, - 128, + 64, 32, 128, - 32, + 16, 128, 4, 1, @@ -780,7 +911,7 @@ def get_fwd_blobs( for tile, pipeline in itertools.product( tiles, factory.get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl) ): - if tile.F_bm0 != 128 or tile.F_bn0 != 128: + if tile.F_bm0 not in (64, 128) or tile.F_bn0 != 128: continue if pipeline.tag != "qr_async_vsa": continue @@ -846,6 +977,7 @@ def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None: def write_fwd_api(api_pool: FmhaFwdApiPool, autogen_dir: Path) -> None: update_file(autogen_dir / FMHA_FWD_API_FILENAME, api_pool.api) + update_file(autogen_dir / FMHA_FWD_ONESHOT_API_FILENAME, api_pool.oneshot_api) def write_blobs( @@ -865,3 +997,4 @@ def list_blobs( for kernel in kernels: f.write((file_path.parent / GEN_DIR / kernel.filename).as_posix() + "\n") f.write((file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME).as_posix() + "\n") + f.write((file_path.parent / GEN_DIR / FMHA_FWD_ONESHOT_API_FILENAME).as_posix() + "\n") diff --git a/example/ck_tile/50_sparse_attn/docs/kernel_breakdown.png b/example/ck_tile/50_sparse_attn/docs/kernel_breakdown.png new file mode 100644 index 00000000000..8704334155c Binary files /dev/null and b/example/ck_tile/50_sparse_attn/docs/kernel_breakdown.png differ diff --git a/example/ck_tile/50_sparse_attn/docs/plot_sparge_perf.py b/example/ck_tile/50_sparse_attn/docs/plot_sparge_perf.py new file mode 100644 index 
00000000000..95a13d5f65c --- /dev/null +++ b/example/ck_tile/50_sparse_attn/docs/plot_sparge_perf.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +"""Plot sparge perf charts from full_grid.csv. + +Re-run with different fixed (b, h, s, dtype, topk) by editing the constants below. +No GPU / no srun / no rebuild — pure matplotlib from CSV. +""" +import os +import sys +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +# ---------------------------------------------------------------------- +# Tunable constants — edit these to regenerate for a different point. +# ---------------------------------------------------------------------- +CSV_PATH = "/home/AMD/ginolu12/gino_tmp/full_grid.csv" +OUT_DIR = os.path.dirname(os.path.abspath(__file__)) + +# Chart 1 — speedup vs topk for one fixed (b, h, s, dtype) +CHART1_B = 2 +CHART1_H = 32 +CHART1_S = 16384 +CHART1_DTYPE = "fp16" +CHART1_HEAD_DIM = 128 # for title only + +# Chart 2 — kernel breakdown across s for fixed (b, h, dtype, topk) +CHART2_B = 2 +CHART2_H = 32 +CHART2_DTYPE = "fp16" +CHART2_TOPK = 0.4 +CHART2_S_LIST = [2048, 4096, 8192, 16384] +CHART2_HEAD_DIM = 128 # for title only + +DPI = 140 + +# ---------------------------------------------------------------------- +# Helpers +# ---------------------------------------------------------------------- +def is_fail(note): + if not isinstance(note, str): + return False + return "FAIL" in note + +def is_high_spread(note): + if not isinstance(note, str): + return False + return "HIGH_SPREAD" in note + +def load_data(): + df = pd.read_csv(CSV_PATH) + return df + +# ---------------------------------------------------------------------- +# Chart 1 +# ---------------------------------------------------------------------- +def plot_chart1(df, out_path): + sel = df[ + (df["b"] == CHART1_B) + & (df["h"] == CHART1_H) + & (df["s"] == CHART1_S) + & (df["dtype"] == CHART1_DTYPE) + ].copy() + sel = sel.sort_values("topk").reset_index(drop=True) + + if sel.empty: + 
print(f"[chart1] WARNING: no rows for b={CHART1_B} h={CHART1_H} s={CHART1_S} dtype={CHART1_DTYPE}") + return [], 0 + + # Drop fully failed rows but keep partial-fail rows; we'll mask per-series. + # Convert numeric columns + for col in ["sparge_jenga", "sparge_vsa", "sparse_jenga", "sparse_vsa", "fmha_us"]: + sel[col] = pd.to_numeric(sel[col], errors="coerce") + + fmha = sel["fmha_us"] + + # Compute speedups; rows with FAIL on a given column will have NaN already. + series = { + "sparge_vsa": fmha / sel["sparge_vsa"], + "sparge_jenga": fmha / sel["sparge_jenga"], + "sparse_vsa": fmha / sel["sparse_vsa"], + "sparse_jenga": fmha / sel["sparse_jenga"], + } + + style = { + "sparge_vsa": {"color": "#1f77b4", "marker": "o", "lw": 2.0}, + "sparge_jenga": {"color": "#ff7f0e", "marker": "s", "lw": 2.0}, + "sparse_vsa": {"color": "#2ca02c", "marker": "^", "lw": 1.5, "ls": "--"}, + "sparse_jenga": {"color": "#d62728", "marker": "v", "lw": 1.5, "ls": "--"}, + } + + fig, ax = plt.subplots(figsize=(8.5, 5.5), dpi=DPI) + + x = sel["topk"].to_numpy() + + # HIGH_SPREAD overlay first (under main markers) + hs_mask = sel["note"].apply(is_high_spread) + high_spread_cells = [] + if hs_mask.any(): + for _, row in sel[hs_mask].iterrows(): + high_spread_cells.append((row["topk"], row["max_spread_pct"])) + # gray ring underneath every series's data point at that x + for label, sp in series.items(): + xs_hs = x[hs_mask.to_numpy()] + ys_hs = sp[hs_mask.to_numpy()].to_numpy() + ax.scatter(xs_hs, ys_hs, s=180, facecolors="none", + edgecolors="gray", linewidths=1.5, zorder=2) + + for label, sp in series.items(): + st = style[label] + ax.plot(x, sp.to_numpy(), label=label, + color=st["color"], marker=st["marker"], + linewidth=st["lw"], linestyle=st.get("ls", "-"), + markersize=7, zorder=3) + + ax.axhline(1.0, color="black", linestyle=":", linewidth=1.2, label="fmha (baseline)", zorder=1) + + ax.set_xlabel("topk (kept fraction)") + ax.set_ylabel("speedup vs FMHA dense (×)") + ax.set_title( + 
f"Speedup vs FMHA " + f"(b={CHART1_B} h={CHART1_H} s={CHART1_S} d={CHART1_HEAD_DIM} {CHART1_DTYPE})" + ) + ax.grid(True, which="both", linestyle=":", alpha=0.6) + ax.set_xticks(np.arange(0.1, 0.71, 0.1)) + ax.legend(loc="best", framealpha=0.9) + + # Footnote about HIGH_SPREAD overlay + if high_spread_cells: + ax.text(0.01, -0.16, + "Gray rings: HIGH_SPREAD cells (high run-to-run variance)", + transform=ax.transAxes, fontsize=8, color="gray") + + fig.tight_layout() + fig.savefig(out_path, dpi=DPI, bbox_inches="tight") + plt.close(fig) + return high_spread_cells, os.path.getsize(out_path) + + +# ---------------------------------------------------------------------- +# Chart 2 +# ---------------------------------------------------------------------- +def plot_chart2(df, out_path): + sel = df[ + (df["b"] == CHART2_B) + & (df["h"] == CHART2_H) + & (df["dtype"] == CHART2_DTYPE) + & (np.isclose(df["topk"], CHART2_TOPK)) + & (df["s"].isin(CHART2_S_LIST)) + ].copy() + sel = sel.sort_values("s").reset_index(drop=True) + + if sel.empty: + print(f"[chart2] WARNING: no rows for b={CHART2_B} h={CHART2_H} dtype={CHART2_DTYPE} topk={CHART2_TOPK}") + return 0 + + for col in ["sparge_jenga_pre", "sparge_jenga_attn", + "sparge_vsa_pre", "sparge_vsa_attn", "fmha_us"]: + sel[col] = pd.to_numeric(sel[col], errors="coerce") + + s_vals = sel["s"].to_numpy() + n = len(s_vals) + idx = np.arange(n, dtype=float) + + width = 0.35 + offset = width / 2 + 0.02 + + fig, ax = plt.subplots(figsize=(9.0, 5.8), dpi=DPI) + + # Jenga bars (left of group) + jenga_pre = sel["sparge_jenga_pre"].to_numpy() + jenga_attn = sel["sparge_jenga_attn"].to_numpy() + vsa_pre = sel["sparge_vsa_pre"].to_numpy() + vsa_attn = sel["sparge_vsa_attn"].to_numpy() + fmha_vals = sel["fmha_us"].to_numpy() + + color_jenga_pre = "#fdbf6f" # light orange + color_jenga_attn = "#ff7f0e" # orange + color_vsa_pre = "#a6cee3" # light blue + color_vsa_attn = "#1f77b4" # blue + + bj_pre = ax.bar(idx - offset, jenga_pre, width, + 
color=color_jenga_pre, edgecolor="black", linewidth=0.6, + label="sparge_jenga _pre (BlockMap)") + bj_at = ax.bar(idx - offset, jenga_attn, width, bottom=jenga_pre, + color=color_jenga_attn, edgecolor="black", linewidth=0.6, + label="sparge_jenga _attn") + bv_pre = ax.bar(idx + offset, vsa_pre, width, + color=color_vsa_pre, edgecolor="black", linewidth=0.6, + label="sparge_vsa _pre (BlockMap)") + bv_at = ax.bar(idx + offset, vsa_attn, width, bottom=vsa_pre, + color=color_vsa_attn, edgecolor="black", linewidth=0.6, + label="sparge_vsa _attn") + + # Add total labels on top of each stack + totals_jenga = jenga_pre + jenga_attn + totals_vsa = vsa_pre + vsa_attn + for i in range(n): + ax.text(idx[i] - offset, totals_jenga[i], f"{totals_jenga[i]:.0f}", + ha="center", va="bottom", fontsize=8) + ax.text(idx[i] + offset, totals_vsa[i], f"{totals_vsa[i]:.0f}", + ha="center", va="bottom", fontsize=8) + + # FMHA reference: short horizontal dashed segment per group + seg_half = 0.40 + fmha_label_done = False + for i in range(n): + ax.hlines(fmha_vals[i], idx[i] - seg_half, idx[i] + seg_half, + colors="black", linestyles="dashed", linewidth=1.2, + label="fmha dense (reference)" if not fmha_label_done else None, + zorder=5) + ax.text(idx[i] + seg_half + 0.02, fmha_vals[i], + f"fmha {fmha_vals[i]:.0f}", fontsize=7, va="center", color="black") + fmha_label_done = True + + ax.set_xticks(idx) + ax.set_xticklabels([f"s={s}" for s in s_vals.astype(int)]) + ax.set_xlabel("sequence length (s)") + ax.set_ylabel("kernel time (µs)") + ax.set_title( + f"Sparge kernel time breakdown " + f"(b={CHART2_B} h={CHART2_H} d={CHART2_HEAD_DIM} {CHART2_DTYPE}, topk={CHART2_TOPK})" + ) + ax.grid(True, axis="y", linestyle=":", alpha=0.6) + ax.legend(loc="upper left", framealpha=0.9, fontsize=9) + + # log-y is too aggressive — leave linear; bars will just be tall. 
+ fig.tight_layout() + fig.savefig(out_path, dpi=DPI, bbox_inches="tight") + plt.close(fig) + return os.path.getsize(out_path) + + +# ---------------------------------------------------------------------- +# Main +# ---------------------------------------------------------------------- +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + df = load_data() + + chart1_path = os.path.join(OUT_DIR, "speedup_vs_sparsity.png") + chart2_path = os.path.join(OUT_DIR, "kernel_breakdown.png") + + hs_cells, size1 = plot_chart1(df, chart1_path) + size2 = plot_chart2(df, chart2_path) + + print(f"Wrote {chart1_path} ({size1} bytes)") + print(f"Wrote {chart2_path} ({size2} bytes)") + + if hs_cells: + print("HIGH_SPREAD cells in chart-1 selection:") + for topk, pct in hs_cells: + print(f" topk={topk} max_spread_pct={pct}") + else: + print("No HIGH_SPREAD cells in chart-1 selection.") + + +if __name__ == "__main__": + main() diff --git a/example/ck_tile/50_sparse_attn/docs/speedup_vs_sparsity.png b/example/ck_tile/50_sparse_attn/docs/speedup_vs_sparsity.png new file mode 100644 index 00000000000..9a2f053b0b4 Binary files /dev/null and b/example/ck_tile/50_sparse_attn/docs/speedup_vs_sparsity.png differ diff --git a/example/ck_tile/50_sparse_attn/fmha_fwd_trek.hpp b/example/ck_tile/50_sparse_attn/fmha_fwd_trek.hpp index 7349c3576e8..62d40ffbe02 100644 --- a/example/ck_tile/50_sparse_attn/fmha_fwd_trek.hpp +++ b/example/ck_tile/50_sparse_attn/fmha_fwd_trek.hpp @@ -272,7 +272,7 @@ struct fmha_jenga_fwd_traits std::string data_type; bool is_v_rowmajor; mask_enum mask_type; - // TODO: padding check is inside this api + int bm0 = 0; // preferred Q-tile size; 0 = don't care (dispatch picks largest) }; float fmha_jenga_fwd(fmha_jenga_fwd_traits, fmha_jenga_fwd_args, const ck_tile::stream_config&); @@ -280,7 +280,10 @@ float fmha_jenga_fwd(fmha_jenga_fwd_traits, fmha_jenga_fwd_args, const ck_tile:: template float fmha_jenga_fwd_(const ck_tile::stream_config&, fmha_jenga_fwd_args); -float 
fmha_jenga_fwd(fmha_jenga_fwd_args, const ck_tile::stream_config&); +template +void fmha_jenga_fwd_oneshot_(const ck_tile::stream_config&, fmha_jenga_fwd_args); + +void fmha_jenga_fwd_oneshot(fmha_jenga_fwd_traits, fmha_jenga_fwd_args, const ck_tile::stream_config&); // VSA uses the same traits structure as Jenga; aliases for clarity template float fmha_vsa_fwd_(const ck_tile::stream_config&, fmha_vsa_fwd_args); -float fmha_vsa_fwd(fmha_vsa_fwd_args, const ck_tile::stream_config&); +template +void fmha_vsa_fwd_oneshot_(const ck_tile::stream_config&, fmha_vsa_fwd_args); + +void fmha_vsa_fwd_oneshot(fmha_vsa_fwd_traits, fmha_vsa_fwd_args, const ck_tile::stream_config&); diff --git a/example/ck_tile/50_sparse_attn/sparge_blockmap_inst.cpp b/example/ck_tile/50_sparse_attn/sparge_blockmap_inst.cpp new file mode 100644 index 00000000000..3cc674f181f --- /dev/null +++ b/example/ck_tile/50_sparse_attn/sparge_blockmap_inst.cpp @@ -0,0 +1,289 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT +// Hand-written template instantiation for SpargeBlockMapKernel (fp16, D=128). 

#include "sparge_blockmap_trek.hpp"
#include "ck_tile/ops/fmha/block/variants.hpp"

#include
#include
#include
#include

// ============================================================================
// Type configuration for block map kernel (reuses FmhaSparseFwdTypeConfig)
// ============================================================================

// fp16: D=128, kM0=64, kN0=128
using bmap_fp16_block_tile = ck_tile::sequence<64, 128, 128, 128, 128, 128>;
// kM0 kN0 kK0 kN1 kK1 kQKHeaddim(D)

using bmap_fp16_shape =
    ck_tile::TileFmhaShape, // Gemm0BlockWarps
    ck_tile::sequence<16, 16, 16>, // Gemm0WarpTile (unused by blockmap, but
                                   // needed by shape)
    ck_tile::sequence<4, 1, 1>,    // Gemm1BlockWarps
    ck_tile::sequence<16, 16, 16>, // Gemm1WarpTile
    true>;                         // VLayout row-major

using bmap_fp16_trait = ck_tile::TileFmhaTraits; // kIsVRowMajorSkip

using bmap_fp16_variant = ck_tile::ComposedAttention<0, CK_TILE_FMHA_FWD_FAST_EXP2>;
using bmap_fp16_mask    = ck_tile::GenericAttentionMask;

using bmap_fp16_problem = ck_tile::BlockFmhaPipelineProblem;

using bmap_fp16_pipeline = ck_tile::SpargeBlockMapPipeline;
using bmap_fp16_kernel   = ck_tile::SpargeBlockMapKernel;

using kstats_fp16_pipeline = ck_tile::SpargeKStatsPipeline;
using kstats_fp16_kernel   = ck_tile::SpargeKStatsKernel;

// ============================================================================
// bf16: D=128, kM0=64, kN0=128
// ============================================================================

using bmap_bf16_block_tile = ck_tile::sequence<64, 128, 128, 128, 128, 128>;

using bmap_bf16_shape =
    ck_tile::TileFmhaShape,
    ck_tile::sequence<16, 16, 16>,
    ck_tile::sequence<4, 1, 1>,
    ck_tile::sequence<16, 16, 16>,
    true>;

using bmap_bf16_trait = ck_tile::TileFmhaTraits;

using bmap_bf16_variant = ck_tile::ComposedAttention<0, CK_TILE_FMHA_FWD_FAST_EXP2>;
using bmap_bf16_mask    = ck_tile::GenericAttentionMask;

using bmap_bf16_problem =
    ck_tile::BlockFmhaPipelineProblem;

using bmap_bf16_pipeline = ck_tile::SpargeBlockMapPipeline;
using bmap_bf16_kernel   = ck_tile::SpargeBlockMapKernel;

using kstats_bf16_pipeline = ck_tile::SpargeKStatsPipeline;
using kstats_bf16_kernel   = ck_tile::SpargeKStatsKernel;

// ============================================================================
// Internal K-stat workspace (R20): process-lifetime lazy hipMalloc, sized
// to the largest (batch, nhead_k, N_k, D) seen so far. Caller API unchanged.
// ============================================================================

namespace {

// Grow-only device scratch shared by the two-stage launch below. The buffers
// are never shrunk and never freed at exit (process-lifetime by design, see
// banner above), so repeated calls with the same or smaller shapes do not
// reallocate.
struct KStatsWorkspace
{
    void* pooled_k_dev = nullptr; // [batch, nhead_k, N_k, D] fp32
    void* sim_k_dev    = nullptr; // [batch, nhead_k, N_k] uint8

    size_t pooled_k_bytes = 0;
    size_t sim_k_bytes    = 0;

    // Reallocate either buffer only if the requested extent exceeds what is
    // already held. NOTE(review): hipMalloc/hipFree return codes are discarded
    // with (void) casts — on allocation failure the stale/null pointer is
    // passed to the kernels. Consider checking the status here.
    void ensure(int batch, int nhead_k, int N_k, int D)
    {
        const size_t need_p = static_cast(batch) * nhead_k * N_k * D * sizeof(float);
        const size_t need_s = static_cast(batch) * nhead_k * N_k * sizeof(uint8_t);
        if(need_p > pooled_k_bytes)
        {
            if(pooled_k_dev != nullptr)
                (void)hipFree(pooled_k_dev);
            (void)hipMalloc(&pooled_k_dev, need_p);
            pooled_k_bytes = need_p;
        }
        if(need_s > sim_k_bytes)
        {
            if(sim_k_dev != nullptr)
                (void)hipFree(sim_k_dev);
            (void)hipMalloc(&sim_k_dev, need_s);
            sim_k_bytes = need_s;
        }
    }
};

// Meyers-singleton accessor for the workspace (lazily constructed on first use).
KStatsWorkspace& g_kstats_ws()
{
    return []() -> KStatsWorkspace& {
        static KStatsWorkspace ws;
        return ws;
    }();
}

// Launch the two-stage blockmap pipeline on stream s:
//   stage 1 (KStatsKernel)    writes pooled-K means and per-block sim flags
//                             into the workspace;
//   stage 2 (BlockMapKernel)  consumes the workspace and emits the block map
//                             (and optionally LUT / valid_block_num).
// Both stages are enqueued on the same stream, so ordering is implicit.
template
void launch_kstats_then_blockmap(sparge_blockmap_args args, const ck_tile::stream_config& s)
{
    // Workspace is sized per K-block count (N_k) and head dim D of the kernel.
    const int N_k = ck_tile::integer_divide_ceil(args.seqlen_k, BlockMapKernel::kN0);
    const int D   = BlockMapKernel::D;
    auto& ws      = g_kstats_ws();
    ws.ensure(args.batch, args.nhead_k, N_k, D);

    // Stage 1: K stats
    {
        auto [kargs, grids] =
            sparge_kstats_create_kargs_and_grids(args, ws.pooled_k_dev, ws.sim_k_dev);
        const dim3 blocks                      = KStatsKernel::BlockSize();
        constexpr ck_tile::index_t kBlockPerCu = KStatsKernel::kBlockPerCu;
        ck_tile::make_kernel(KStatsKernel{}, grids, blocks, 0, kargs)(
            ck_tile::stream_config{s.stream_id_});
    }
    // Stage 2: block_map (reads ws)
    {
        auto [kargs, grids] = sparge_blockmap_create_kargs_and_grids(
            args, ws.pooled_k_dev, ws.sim_k_dev);
        const dim3 blocks                      = BlockMapKernel::BlockSize();
        constexpr ck_tile::index_t kBlockPerCu = BlockMapKernel::kBlockPerCu;
        ck_tile::make_kernel(BlockMapKernel{}, grids, blocks, 0, kargs)(
            ck_tile::stream_config{s.stream_id_});
    }
}

} // namespace

// ============================================================================
// Dispatch
// ============================================================================

// Timed entry point: runs kstats+blockmap under ck_tile::launch_kernel (which
// handles warmup/repeat per the stream_config) and returns the measured time.
// Returns -1.f for unsupported (data_type, hdim_q) combinations.
float sparge_blockmap_fwd(sparge_blockmap_traits traits,
                          sparge_blockmap_args args,
                          const ck_tile::stream_config& s)
{
    if(traits.data_type == "fp16" && traits.hdim_q == 128)
    {
        if(s.log_level_ > 0)
            std::cout << ", sparge_blockmap_fp16_d128" << std::flush;
        return ck_tile::launch_kernel(s, [=](const ck_tile::stream_config& s_) {
            launch_kstats_then_blockmap(args, s_);
        });
    }

    if(traits.data_type == "bf16" && traits.hdim_q == 128)
    {
        if(s.log_level_ > 0)
            std::cout << ", sparge_blockmap_bf16_d128" << std::flush;
        return ck_tile::launch_kernel(s, [=](const ck_tile::stream_config& s_) {
            launch_kstats_then_blockmap(args, s_);
        });
    }

    if(s.log_level_ > 0)
        std::cerr << "sparge_blockmap_fwd: unsupported config (data_type=" << traits.data_type
                  << ", hdim_q=" << traits.hdim_q << ")" << std::endl;
    return -1.f;
}

// ============================================================================
// Oneshot version: launches kernel without timing wrapper
// ============================================================================

// Same dispatch as sparge_blockmap_fwd but enqueues exactly once on the given
// stream with no timing/repeat logic; used as a stage inside the combined
// timed launches below. Unsupported configs only log to stderr.
void sparge_blockmap_fwd_oneshot(sparge_blockmap_traits traits,
                                 sparge_blockmap_args args,
                                 const ck_tile::stream_config& s)
{
    if(traits.data_type == "fp16" && traits.hdim_q == 128)
    {
        launch_kstats_then_blockmap(args, s);
        return;
    }

    if(traits.data_type == "bf16" && traits.hdim_q == 128)
    {
        launch_kstats_then_blockmap(args, s);
        return;
    }

    std::cerr << "sparge_blockmap_fwd_oneshot: unsupported config (data_type=" << traits.data_type
              << ", hdim_q=" << traits.hdim_q << ")" << std::endl;
}

// ============================================================================
// Combined functions: blockmap + attention timed together via launch_kernel
// ============================================================================

// Blockmap stage followed by the Jenga sparse-attention stage, both enqueued
// inside one ck_tile::launch_kernel call so the reported time covers the full
// end-to-end pipeline. Returns the measured time from launch_kernel.
float sparge_jenga_fwd(sparge_blockmap_traits bmap_t, sparge_blockmap_args bmap_a,
                       fmha_jenga_fwd_traits attn_t, fmha_jenga_fwd_args attn_a,
                       const ck_tile::stream_config& s)
{
    if(s.log_level_ > 0)
        std::cout << ", sparge_blockmap_" << bmap_t.data_type << "_d" << bmap_t.hdim_q
                  << ", fmha_jenga_fwd_" << attn_t.data_type << "_d" << attn_t.hdim_q
                  << std::flush;

    return ck_tile::launch_kernel(
        s,
        [=](const ck_tile::stream_config& s_) {
            sparge_blockmap_fwd_oneshot(bmap_t, bmap_a, s_);
        },
        [=](const ck_tile::stream_config& s_) {
            fmha_jenga_fwd_oneshot(attn_t, attn_a, s_);
        });
}

// Same as sparge_jenga_fwd but with the VSA attention backend as stage 2.
float sparge_vsa_fwd_combined(sparge_blockmap_traits bmap_t, sparge_blockmap_args bmap_a,
                              fmha_vsa_fwd_traits attn_t, fmha_vsa_fwd_args attn_a,
                              const ck_tile::stream_config& s)
{
    if(s.log_level_ > 0)
        std::cout << ", sparge_blockmap_" << bmap_t.data_type << "_d" << bmap_t.hdim_q
                  << ", fmha_vsa_fwd_" << attn_t.data_type << "_d" << attn_t.hdim_q
                  << std::flush;

    return ck_tile::launch_kernel(
        s,
        [=](const ck_tile::stream_config& s_) {
            sparge_blockmap_fwd_oneshot(bmap_t, bmap_a, s_);
        },
        [=](const ck_tile::stream_config& s_) {
            fmha_vsa_fwd_oneshot(attn_t, attn_a, s_);
        });
}
diff --git a/example/ck_tile/50_sparse_attn/sparge_blockmap_trek.hpp b/example/ck_tile/50_sparse_attn/sparge_blockmap_trek.hpp
new file mode 100644
index 00000000000..92c32d29e85
--- /dev/null
+++
b/example/ck_tile/50_sparse_attn/sparge_blockmap_trek.hpp @@ -0,0 +1,145 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp"
#include "ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp"
#include "ck_tile/ops/sparse_attn/pipeline/sparge_blockmap_pipeline.hpp"
#include "ck_tile/ops/sparse_attn/pipeline/sparge_kstats_pipeline.hpp"
#include "ck_tile/ops/sparse_attn/kernel/sparge_blockmap_kernel.hpp"
#include "ck_tile/ops/sparse_attn/kernel/sparge_kstats_kernel.hpp"

#include "fmha_fwd_trek.hpp"

#include
#include

// ============================================================================
// Args and traits for sparge block map GPU kernel
// ============================================================================

// Host-side argument bundle for the two-stage blockmap launch (K-stats then
// block map). Pointers are raw device pointers; strides are element strides.
struct sparge_blockmap_args
{
    const void* q_ptr; // device Q tensor
    const void* k_ptr; // device K tensor

    ck_tile::index_t batch;
    ck_tile::index_t seqlen_q;
    ck_tile::index_t seqlen_k;
    ck_tile::index_t hdim_q;
    ck_tile::index_t nhead_q;
    ck_tile::index_t nhead_k;

    ck_tile::index_t stride_q;
    ck_tile::index_t stride_k;
    ck_tile::index_t nhead_stride_q;
    ck_tile::index_t nhead_stride_k;
    ck_tile::index_t batch_stride_q;
    ck_tile::index_t batch_stride_k;

    float simthreshd1; // scalar similarity-gate threshold
    float cdfthreshd;  // scalar CDF threshold (used when topk < 0)
    float topk;        // scalar top-k fraction (used when > 0)
    float scale;       // softmax scale applied to pooled QK scores

    void* block_map_ptr;       // output: one-hot block map
    void* lut_ptr;             // output: delta-encoded LUT (VSA path; may be null)
    void* valid_block_num_ptr; // output: per-row valid count (VSA path; may be null)

    // R21A Phase 4 + R21B fix: optional per-head superparams. nullptr => use scalar.
    // Buffer sizes match SpargeAttn upstream contract (utils.py:324-328: all sized
    // by Headnum=q.size(1)=nhead_q). K-side kernel still indexes [hk] into the
    // first nhead_k entries — for MHA equivalent to old [nhead_k] sizing, for
    // MQA/GQA aligns to upstream tuned ckpt layout.
    const float* simthreshd1_per_head_ptr = nullptr; // size = nhead_q floats (kernel reads [0..nhead_k-1])
    const float* cdfthreshd_per_head_ptr  = nullptr; // size = nhead_q floats
    const float* topk_per_head_ptr        = nullptr; // size = nhead_q floats
};

// Dispatch key: selects the hand-written instantiation in
// sparge_blockmap_inst.cpp ("fp16"/"bf16", hdim 128 only today).
struct sparge_blockmap_traits
{
    std::string data_type;
    int hdim_q;
};

// ============================================================================
// Create kernel args and grid dimensions
// ============================================================================

// Pack sparge_blockmap_args plus the K-stat workspace pointers into the
// BlockMapKernel's kargs, and compute its launch grid.
// Returns a tuple (kargs, grids).
template
auto sparge_blockmap_create_kargs_and_grids(sparge_blockmap_args args,
                                            const void* pooled_k_ws_ptr,
                                            const void* sim_k_ws_ptr)
{
    // GQA/MQA requires the Q-head count to be a multiple of the K-head count.
    assert(args.nhead_q % args.nhead_k == 0);
    auto kargs = BlockMapKernel::MakeKargs(args.q_ptr,
                                           args.k_ptr,
                                           args.seqlen_q,
                                           args.seqlen_k,
                                           args.hdim_q,
                                           args.nhead_q,
                                           args.nhead_q / args.nhead_k, // heads-per-K-head ratio
                                           args.stride_q,
                                           args.stride_k,
                                           args.nhead_stride_q,
                                           args.nhead_stride_k,
                                           args.batch_stride_q,
                                           args.batch_stride_k,
                                           args.simthreshd1,
                                           args.cdfthreshd,
                                           args.topk,
                                           args.scale,
                                           args.block_map_ptr,
                                           args.lut_ptr,
                                           args.valid_block_num_ptr,
                                           pooled_k_ws_ptr,
                                           sim_k_ws_ptr,
                                           args.topk_per_head_ptr,
                                           args.cdfthreshd_per_head_ptr);

    dim3 grids = BlockMapKernel::GridSize(args.batch, args.nhead_q, args.seqlen_q);
    return ck_tile::make_tuple(kargs, grids);
}

// Same packing for the stage-1 K-stats kernel; it only reads K and writes the
// pooled-K / sim-K workspace buffers. Returns a tuple (kargs, grids).
template
auto sparge_kstats_create_kargs_and_grids(sparge_blockmap_args args,
                                          void* pooled_k_ws_ptr,
                                          void* sim_k_ws_ptr)
{
    assert(args.nhead_q % args.nhead_k == 0);
    auto kargs = KStatsKernel::MakeKargs(args.k_ptr,
                                         args.seqlen_k,
                                         args.hdim_q,
                                         args.nhead_k,
                                         args.stride_k,
                                         args.nhead_stride_k,
                                         args.batch_stride_k,
                                         args.simthreshd1,
                                         pooled_k_ws_ptr,
                                         sim_k_ws_ptr,
                                         args.simthreshd1_per_head_ptr);

    dim3 grids = KStatsKernel::GridSize(args.batch, args.nhead_k, args.seqlen_k);
    return ck_tile::make_tuple(kargs, grids);
}

// ============================================================================
// Hand-written template instantiation dispatch
// ============================================================================
float sparge_blockmap_fwd(sparge_blockmap_traits traits,
                          sparge_blockmap_args args,
                          const ck_tile::stream_config& stream_config);

void sparge_blockmap_fwd_oneshot(sparge_blockmap_traits traits,
                                 sparge_blockmap_args args,
                                 const ck_tile::stream_config& stream_config);

// Combined functions: blockmap + attention with unified timing
float sparge_jenga_fwd(sparge_blockmap_traits, sparge_blockmap_args,
                       fmha_jenga_fwd_traits, fmha_jenga_fwd_args,
                       const ck_tile::stream_config&);

float sparge_vsa_fwd_combined(sparge_blockmap_traits, sparge_blockmap_args,
                              fmha_vsa_fwd_traits, fmha_vsa_fwd_args,
                              const ck_tile::stream_config&);
diff --git a/example/ck_tile/50_sparse_attn/sparge_tool.hpp b/example/ck_tile/50_sparse_attn/sparge_tool.hpp
new file mode 100644
index 00000000000..49c69cc6f74
--- /dev/null
+++ b/example/ck_tile/50_sparse_attn/sparge_tool.hpp
@@ -0,0 +1,408 @@
#pragma once

#include
#include
#include
#include
#include
#include
#include
#include

#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"

namespace sparge {

// Host-reference hyperparameters mirroring the GPU blockmap kernel's inputs.
struct SpargeParams
{
    int BLKQ = 128; // Q-block (tile) size in tokens
    int BLKK = 128; // K-block (tile) size in tokens

    // Similarity gate threshold (TODO: per-head support).
    float simthreshd1 = 0.6f;

    // Exactly one of the following should be used:
    // - Use CDF threshold if topk < 0
    // - Both are expected to lie in [0, 1].
    //   NOTE(review): this range is not validated anywhere visible here —
    //   confirm callers guarantee it, or clamp.
    float cdfthreshd = 0.98f;
    float topk       = -1.0f;

    // If true, treat Q/K as BHSD; otherwise BSHD (same convention as CK examples).
    bool i_perm = true;
};

// Output format CK VSA expects.
struct VSALut
{
    ck_tile::HostTensor lut;             // [B, Hq, Q_blk, K_blk] delta-encoded
    ck_tile::HostTensor valid_block_num; // [B, Hq, Q_blk]
};

namespace detail {

// Convert any CK element type to float via ck_tile::type_convert.
template
inline float to_f32(const T& x)
{
    return ck_tile::type_convert(x);
}

// Read element from HostTensor with either BHSD or BSHD layout.
// Q: [B, Hq, Sq, D] if i_perm else [B, Sq, Hq, D]
// K: [B, Hk, Sk, D] if i_perm else [B, Sk, Hk, D]
template
inline float load(const ck_tile::HostTensor& X, bool i_perm, int b, int h, int s, int d)
{
    return i_perm ? to_f32(X(b, h, s, d)) : to_f32(X(b, s, h, d));
}

// Compute pooled mean vector of one block: mean over tokens in [s0, s1).
// Returns a d-length vector; all zeros when the block is empty (s1 <= s0).
template
std::vector
pooled_mean_block(const ck_tile::HostTensor& X, bool i_perm, int b, int h, int s0, int s1, int d)
{
    std::vector mean(d, 0.0f);
    const int bs = std::max(0, s1 - s0);
    if(bs == 0)
        return mean;

    for(int s = s0; s < s1; ++s)
    {
        for(int d_ = 0; d_ < d; ++d_)
        {
            mean[d_] += load(X, i_perm, b, h, s, d_);
        }
    }
    const float inv = 1.0f / static_cast(bs);
    for(int d_ = 0; d_ < d; ++d_)
        mean[d_] *= inv;
    return mean;
}

// Compute "sim" flag of one block following SpargeAttn's intent:
// mean_sim = sum(Gram(x_hat)) / (BS_*BS_), where x_hat are token vectors normalized along D.
//
// Important: sum(Gram) = ||sum_i x_hat_i||^2, so we can compute it in O(BS_*D) exactly
// instead of O(BS_^2 * D).
template
bool sim_block_flag(const ck_tile::HostTensor& X,
                    bool i_perm,
                    int b,
                    int h,
                    int s0,
                    int s1,
                    int d,
                    float simthreshd1)
{
    const int bs = std::max(0, s1 - s0);
    if(bs == 0)
        return false; // empty block is never "similar"

    std::vector sum_hat(d, 0.0f);

    for(int s = s0; s < s1; ++s)
    {
        // Compute L2 norm over D.
        float norm2 = 0.0f;
        for(int d_ = 0; d_ < d; ++d_)
        {
            const float v = load(X, i_perm, b, h, s, d_);
            norm2 += v * v;
        }
        float inv_norm = 1.0f;
        // Upstream SpargeAttn adds an eps to prevent division by zero; here we
        // simply skip the division for a zero-norm row (whose contribution is
        // the zero vector either way).
        if(norm2 > 0.0f)
            inv_norm = 1.0f / std::sqrt(norm2);

        // Accumulate normalized vector.
        for(int d_ = 0; d_ < d; ++d_)
        {
            sum_hat[d_] += load(X, i_perm, b, h, s, d_) * inv_norm;
        }
    }

    // ||sum_i x_hat_i||^2 == sum of the Gram matrix of the normalized rows.
    float sum_gram = 0.0f;
    for(int d_ = 0; d_ < d; ++d_)
        sum_gram += sum_hat[d_] * sum_hat[d_];

    const float denom    = static_cast(bs) * static_cast(bs);
    const float mean_sim = sum_gram / denom;

    return mean_sim > simthreshd1;
}

// Given probabilities already sorted in descending order, return the smallest
// n such that cdf[n-1] >= cdfthreshd (at least 1 for non-empty input). If the
// threshold is never reached (e.g. all-zero probs), selects everything.
inline int select_count_from_cdf(const std::vector& sorted_probs, float cdfthreshd)
{
    // Choose the smallest n such that cdf[n-1] >= cdfthreshd.
    // Ensure at least 1.
    if(sorted_probs.empty())
        return 0;
    if(cdfthreshd <= 0.0f)
        return 1;

    float c = 0.0f;
    for(int i = 0; i < static_cast(sorted_probs.size()); ++i)
    {
        c += sorted_probs[i];
        if(c >= cdfthreshd)
            return i + 1;
    }
    return static_cast(sorted_probs.size());
}

// floor(topk * K_blk), clamped to at least 1 (0 only when there are no blocks).
inline int select_count_from_topk(int K_blk, float topk)
{
    if(K_blk <= 0)
        return 0;
    int n = static_cast(std::floor(topk * static_cast(K_blk)));
    n     = std::max(1, n);
    return n;
}

} // namespace detail

// Build one-hot block_map[b,hq,qb,kb] in {0,1}.
// - No causal mask
// - No attention sink
// - Logic matches SpargeAttn's structure:
//   - score softmax is only over sim_kblocks; ~sim_kblocks are forced ON later
//   - if a Q-block is not "similar", force the whole row ON
template
ck_tile::HostTensor build_block_map_meansim(const ck_tile::HostTensor& Q,
                                            const ck_tile::HostTensor& K,
                                            const SpargeParams& p)
{
    const auto qlens = Q.get_lengths();
    const auto klens = K.get_lengths();

    // Decode extents according to the layout flag (BHSD vs BSHD).
    const int B  = static_cast(qlens[0]);
    const int Hq = p.i_perm ? static_cast(qlens[1]) : static_cast(qlens[2]);
    const int Sq = p.i_perm ? static_cast(qlens[2]) : static_cast(qlens[1]);
    const int D  = static_cast(qlens[3]);

    [[maybe_unused]] const int Bk = static_cast(klens[0]);
    const int Hk                  = p.i_perm ? static_cast(klens[1]) : static_cast(klens[2]);
    const int Sk                  = p.i_perm ? static_cast(klens[2]) : static_cast(klens[1]);
    [[maybe_unused]] const int Dk = static_cast(klens[3]);

    assert(B == Bk && D == Dk && Hq % Hk == 0);
    assert(p.BLKQ > 0 && p.BLKK > 0);

    const int nhead_ratio_qk = Hq / Hk; // GQA/MQA: Q heads per K head
    const int Q_blk          = ck_tile::integer_divide_ceil(Sq, p.BLKQ);
    const int K_blk          = ck_tile::integer_divide_ceil(Sk, p.BLKK);

    ck_tile::HostTensor block_map({B, Hq, Q_blk, K_blk});

    // pooled_q: [B,Hq,Q_blk,D], pooled_k: [B,Hk,K_blk,D]
    // sim_q: [B,Hq,Q_blk], sim_k: [B,Hk,K_blk]
    std::vector pooled_q(static_cast(B) * Hq * Q_blk * D, 0.0f);
    std::vector pooled_k(static_cast(B) * Hk * K_blk * D, 0.0f);
    std::vector sim_q(static_cast(B) * Hq * Q_blk, 0);
    std::vector sim_k(static_cast(B) * Hk * K_blk, 0);

    // Flat-index helpers for the scratch arrays above.
    auto idx_pq = [&](int b, int hq, int qb, int d) {
        return (((b * Hq + hq) * Q_blk + qb) * D + d);
    };
    auto idx_pk = [&](int b, int hk, int kb, int d) {
        return (((b * Hk + hk) * K_blk + kb) * D + d);
    };
    auto idx_sq = [&](int b, int hq, int qb) { return ((b * Hq + hq) * Q_blk + qb); };
    auto idx_sk = [&](int b, int hk, int kb) { return ((b * Hk + hk) * K_blk + kb); };

    // Pass 1: pooled means and sim flags for every Q block and K block.
    for(int b = 0; b < B; ++b)
    {
        for(int hq = 0; hq < Hq; ++hq)
        {
            // Q blocks
            for(int qb = 0; qb < Q_blk; ++qb)
            {
                const int s0 = qb * p.BLKQ;
                const int s1 = std::min(Sq, (qb + 1) * p.BLKQ); // last block may be short

                // pooled mean
                auto mean = detail::pooled_mean_block(Q, p.i_perm, b, hq, s0, s1, D);
                for(int d = 0; d < D; ++d)
                    pooled_q[idx_pq(b, hq, qb, d)] = mean[d];

                // sim flag
                sim_q[idx_sq(b, hq, qb)] =
                    detail::sim_block_flag(Q, p.i_perm, b, hq, s0, s1, D, p.simthreshd1) ? 1 : 0;
            }
        }

        for(int hk = 0; hk < Hk; ++hk)
        {
            // K blocks
            for(int kb = 0; kb < K_blk; ++kb)
            {
                const int s0 = kb * p.BLKK;
                const int s1 = std::min(Sk, (kb + 1) * p.BLKK);

                auto mean = detail::pooled_mean_block(K, p.i_perm, b, hk, s0, s1, D);
                for(int d = 0; d < D; ++d)
                    pooled_k[idx_pk(b, hk, kb, d)] = mean[d];

                sim_k[idx_sk(b, hk, kb)] =
                    detail::sim_block_flag(K, p.i_perm, b, hk, s0, s1, D, p.simthreshd1) ? 1 : 0;
            }
        }
    }

    const float scale = 1.0f / std::sqrt(static_cast(D));

    // Pass 2: per (b, hq, qb) row, score K blocks and select.
    for(int b = 0; b < B; ++b)
    {
        for(int hq = 0; hq < Hq; ++hq)
        {
            const int hk = hq / nhead_ratio_qk;

            for(int qb = 0; qb < Q_blk; ++qb)
            {
                const bool q_is_sim = (sim_q[idx_sq(b, hq, qb)] != 0);

                // If Q-block is not "similar", force dense row.
                if(!q_is_sim)
                {
                    for(int kb = 0; kb < K_blk; ++kb)
                        block_map(b, hq, qb, kb) = 1;
                    continue;
                }

                // Compute scores over K blocks (only sim_kblocks participate in softmax;
                // non-sim K blocks are forced ON immediately and keep score = -inf so
                // they are excluded from the softmax below).
                std::vector score(K_blk, -std::numeric_limits::infinity());
                for(int kb = 0; kb < K_blk; ++kb)
                {
                    const bool k_is_sim = (sim_k[idx_sk(b, hk, kb)] != 0);
                    if(!k_is_sim)
                    {
                        block_map(b, hq, qb, kb) = 1;
                        continue;
                    }

                    float dot = 0.0f;
                    for(int d = 0; d < D; ++d)
                    {
                        dot += pooled_q[idx_pq(b, hq, qb, d)] * pooled_k[idx_pk(b, hk, kb, d)];
                    }
                    score[kb] = dot * scale;
                }

                // Softmax over K_blk (numerically stable). If all -inf, probs become all zeros.
                float maxv = -std::numeric_limits::infinity();
                for(int kb = 0; kb < K_blk; ++kb)
                    maxv = std::max(maxv, score[kb]);

                std::vector prob(K_blk, 0.0f);
                if(std::isfinite(maxv))
                {
                    float sumexp = 0.0f;
                    for(int kb = 0; kb < K_blk; ++kb)
                    {
                        if(!std::isfinite(score[kb]))
                            continue;
                        const float e = std::exp(score[kb] - maxv);
                        prob[kb]      = e;
                        sumexp += e;
                    }
                    if(sumexp > 0.0f)
                    {
                        const float inv = 1.0f / sumexp;
                        for(int kb = 0; kb < K_blk; ++kb)
                            prob[kb] *= inv;
                    }
                    else
                    {
                        // All exponentials underflowed: keep zeros.
                        std::fill(prob.begin(), prob.end(), 0.0f);
                    }
                }

                // Sort indices by prob descending.
                std::vector order(K_blk);
                std::iota(order.begin(), order.end(), 0);
                std::sort(order.begin(), order.end(), [&](int a, int c) {
                    if(prob[a] != prob[c])
                        return prob[a] > prob[c];
                    return a < c; // tie-breaker for determinism
                });

                // Determine how many to select.
                int num_to_select = 0;
                if(p.topk > 0.0f)
                {
                    num_to_select = detail::select_count_from_topk(K_blk, p.topk);
                }
                else
                {
                    // Use CDF threshold selection (smallest n s.t. cumulative prob >= cdfthreshd).
                    std::vector sorted_probs(K_blk);
                    for(int i = 0; i < K_blk; ++i)
                        sorted_probs[i] = prob[order[i]];
                    num_to_select = detail::select_count_from_cdf(sorted_probs, p.cdfthreshd);
                    num_to_select = std::max(1, num_to_select);
                }

                // Select the top num_to_select blocks, i.e. order[0..num_to_select-1].
                for(int i = 0; i < num_to_select; ++i)
                {
                    const int kb             = order[i];
                    block_map(b, hq, qb, kb) = 1;
                }
            }
        }
    }

    return block_map;
}

// Convert one-hot block_map -> delta-encoded LUT + valid_block_num (CK VSA format).
// Delta encoding: for each (b, h, q) row, lut[i] holds the gap between the
// i-th selected K-block index and the previous selected index (prev starts at
// 0, so the first entry is the absolute index of the first ON block), and
// valid_block_num holds how many entries of the row are meaningful.
template
VSALut block_map_to_vsa_lut_delta(const ck_tile::HostTensor& block_map)
{
    const auto lens = block_map.get_lengths();
    const int B     = static_cast(lens[0]);
    const int H     = static_cast(lens[1]);
    const int Q     = static_cast(lens[2]);
    const int K     = static_cast(lens[3]);

    VSALut out{
        ck_tile::HostTensor({B, H, Q, K}),
        ck_tile::HostTensor({B, H, Q}),
    };

    for(int b = 0; b < B; ++b)
    {
        for(int h = 0; h < H; ++h)
        {
            for(int q = 0; q < Q; ++q)
            {
                int32_t valid = 0; // number of ON blocks seen so far in this row
                int32_t prev  = 0; // previous ON block index (0 before the first)

                for(int k = 0; k < K; ++k)
                {
                    const bool on = static_cast(block_map(b, h, q, k)) != 0;
                    if(on)
                    {
                        out.lut(b, h, q, valid) = static_cast(k - prev);
                        prev                    = static_cast(k);
                        ++valid;
                    }
                }

                out.valid_block_num(b, h, q) = valid;

                // Optional: zero-fill the unused tail for determinism.
                for(int i = valid; i < K; ++i)
                    out.lut(b, h, q, i) = 0;
            }
        }
    }

    return out;
}

} // namespace sparge
diff --git a/example/ck_tile/50_sparse_attn/test_sparge.cpp b/example/ck_tile/50_sparse_attn/test_sparge.cpp
new file mode 100644
index 00000000000..4c97a10d0f0
--- /dev/null
+++ b/example/ck_tile/50_sparse_attn/test_sparge.cpp
@@ -0,0 +1,465 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// Unified test for Sparge pipeline: blockmap generation + sparse attention (Jenga/VSA).
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/host.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/host/reference/reference_blocked_attention.hpp" +#include "ck_tile/core/utility/bit_cast.hpp" + +#include "fmha_fwd_trek.hpp" +#include "sparge_blockmap_trek.hpp" +#include "sparge_tool.hpp" + +// ============================================================================ +// Helpers +// ============================================================================ + +template +ck_tile::HostTensor +make_qkv_tensor(ck_tile::index_t batch, ck_tile::index_t nhead, ck_tile::index_t seqlen, ck_tile::index_t hdim, bool i_perm) +{ + if(i_perm) + return ck_tile::HostTensor({batch, nhead, seqlen, hdim}); + return ck_tile::HostTensor({batch, seqlen, nhead, hdim}); +} + +template +ck_tile::HostTensor to_bhsd(const ck_tile::HostTensor& tensor, bool is_bhsd) +{ + auto lens = tensor.get_lengths(); + ck_tile::index_t batch = lens[0]; + ck_tile::index_t seqlen = is_bhsd ? lens[2] : lens[1]; + ck_tile::index_t nhead = is_bhsd ? lens[1] : lens[2]; + ck_tile::index_t hdim = lens[3]; + + ck_tile::HostTensor out({batch, nhead, seqlen, hdim}); + for(ck_tile::index_t b = 0; b < batch; ++b) + for(ck_tile::index_t h = 0; h < nhead; ++h) + for(ck_tile::index_t s = 0; s < seqlen; ++s) + for(ck_tile::index_t d = 0; d < hdim; ++d) + out(b, h, s, d) = is_bhsd ? 
tensor(b, h, s, d) : tensor(b, s, h, d); + return out; +} + +template +auto get_error_tolerance() +{ + double rtol = 1e-2; + double atol = 4e-2; + if constexpr(std::is_same_v) + { + atol = 2e-1; + rtol = 2e-1; + } + return ck_tile::make_tuple(rtol, atol); +} + +template +float to_float_for_compare(T value) +{ + return static_cast(value); +} + +template <> +float to_float_for_compare(ck_tile::bf16_t value) +{ +#if CK_TILE_USE_CUSTOM_DATA_TYPE + return static_cast(value); +#else + return ck_tile::bf16_to_float_raw(ck_tile::bit_cast(value)); +#endif +} + +// ============================================================================ +// Arg parser +// ============================================================================ +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser + .insert("v", "1", "0:no validation, 1:cpu validation") + .insert("pipeline", "jenga", "attention pipeline: jenga / vsa") + .insert("b", "1", "batch size") + .insert("h", "4", "num of head for q") + .insert("h_k", "-1", "num of head for k/v, -1 means equal to h") + .insert("s", "4096", "seqlen_q") + .insert("s_k", "-1", "seqlen_k, -1 means equal to s") + .insert("d", "128", "head dim for q, k") + .insert("d_v", "-1", "head dim for v, -1 means equal to d") + .insert("topk", "0.3", "topk ratio for blockmap (fraction of K-blocks to keep)") + .insert("cdfthreshd", "-1", "CDF threshold for blockmap (overrides topk if >= 0)") + .insert("simthreshd1", "0.6", "similarity threshold for blockmap") + .insert("prec", "fp16", "data type: fp16/bf16") + .insert("iperm", "1", "permute input, 1: b*h*s*d, 0: b*s*h*d") + .insert("operm", "1", "permute output") + .insert("seed", "42", "random seed") + .insert("warmup", "5", "warmup iterations") + .insert("repeat", "20", "benchmark iterations") + .insert("kname", "0", "print kernel name") + .insert("perhead", "0", + "R21A Phase 4: 0=scalar (default), 1=per-head [H] superparam test " + "(varies topk[h] = topk * (1 + 0.5*(h - 
H/2)/H), simthreshd1 unchanged)"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +// ============================================================================ +// Main test +// ============================================================================ +template +bool run_test(const ck_tile::ArgParser& arg_parser) +{ + int do_validation = arg_parser.get_int("v"); + std::string pipeline = arg_parser.get_str("pipeline"); + ck_tile::index_t batch = arg_parser.get_int("b"); + ck_tile::index_t nhead = arg_parser.get_int("h"); + ck_tile::index_t nhead_k = arg_parser.get_int("h_k"); + ck_tile::index_t seqlen_q = arg_parser.get_int("s"); + ck_tile::index_t seqlen_k = arg_parser.get_int("s_k"); + ck_tile::index_t hdim_q = arg_parser.get_int("d"); + ck_tile::index_t hdim_v = arg_parser.get_int("d_v"); + float topk = arg_parser.get_float("topk"); + float cdfthreshd = arg_parser.get_float("cdfthreshd"); + float simthreshd1 = arg_parser.get_float("simthreshd1"); + bool i_perm = arg_parser.get_bool("iperm"); + bool o_perm = arg_parser.get_bool("operm"); + uint32_t seed = arg_parser.get_uint32("seed"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + int kname = arg_parser.get_int("kname"); + int perhead = arg_parser.get_int("perhead"); + + if(nhead_k < 0) nhead_k = nhead; + if(seqlen_k < 0) seqlen_k = seqlen_q; + if(hdim_v < 0) hdim_v = hdim_q; + + // If cdfthreshd >= 0, use CDF mode; otherwise use topk mode + if(cdfthreshd >= 0.0f) + topk = -1.0f; + + constexpr ck_tile::index_t BLKQ = 64; + constexpr ck_tile::index_t BLKK = 128; + + if(hdim_q != 128 || hdim_v != 128) + { + std::cout << "\n>>> TEST SKIPPED <<<\n" + << "Kernel instances are generated for hdim=128 only.\n"; + return true; + } + + ck_tile::index_t num_q_blocks = (seqlen_q + BLKQ - 1) / BLKQ; + ck_tile::index_t num_k_blocks = (seqlen_k + BLKK - 1) / BLKK; + + std::string prec_str = std::is_same_v ? 
"fp16" : "bf16"; + std::cout << "[" << pipeline << "|" << prec_str + << "] b=" << batch << " h=" << nhead << " s=" << seqlen_q + << " d=" << hdim_q << " topk=" << topk + << " sim1=" << simthreshd1 << std::flush; + + // ---- allocate host tensors ---- + auto q_host = make_qkv_tensor(batch, nhead, seqlen_q, hdim_q, i_perm); + auto k_host = make_qkv_tensor(batch, nhead_k, seqlen_k, hdim_q, i_perm); + auto v_host = make_qkv_tensor(batch, nhead_k, seqlen_k, hdim_v, i_perm); + auto output_host = o_perm ? ck_tile::HostTensor({batch, nhead, seqlen_q, hdim_v}) + : ck_tile::HostTensor({batch, seqlen_q, nhead, hdim_v}); + + ck_tile::HostTensor block_map_host({batch, nhead, num_q_blocks, num_k_blocks}); + ck_tile::HostTensor lut_host({batch, nhead, num_q_blocks, num_k_blocks}); + ck_tile::HostTensor valid_block_num_host({batch, nhead, num_q_blocks}); + + ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed}(q_host); + ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed + 1}(k_host); + ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed + 2}(v_host); + + // ---- device tensors ---- + ck_tile::DeviceMem q_dev(q_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem k_dev(k_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem v_dev(v_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem o_dev(output_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem block_map_dev(block_map_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem lut_dev(lut_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem valid_bn_dev(valid_block_num_host.get_element_space_size_in_bytes()); + + q_dev.ToDevice(q_host.data()); + k_dev.ToDevice(k_host.data()); + v_dev.ToDevice(v_host.data()); + o_dev.SetZero(); + block_map_dev.SetZero(); + lut_dev.SetZero(); + valid_bn_dev.SetZero(); + + // ---- strides (BHSD when i_perm=true) ---- + auto q_strides = q_host.get_strides(); + auto k_strides = k_host.get_strides(); + auto v_strides = v_host.get_strides(); + auto 
o_strides = output_host.get_strides(); + + float scale_s = 1.0f / std::sqrt(static_cast(hdim_q)); + + // ---- build blockmap args ---- + sparge_blockmap_traits bmap_traits; + bmap_traits.data_type = std::is_same_v ? "fp16" : "bf16"; + bmap_traits.hdim_q = hdim_q; + + sparge_blockmap_args bmap_args; + bmap_args.q_ptr = q_dev.GetDeviceBuffer(); + bmap_args.k_ptr = k_dev.GetDeviceBuffer(); + bmap_args.batch = batch; + bmap_args.seqlen_q = seqlen_q; + bmap_args.seqlen_k = seqlen_k; + bmap_args.hdim_q = hdim_q; + bmap_args.nhead_q = nhead; + bmap_args.nhead_k = nhead_k; + bmap_args.stride_q = q_strides[i_perm ? 2 : 1]; + bmap_args.stride_k = k_strides[i_perm ? 2 : 1]; + bmap_args.nhead_stride_q = q_strides[i_perm ? 1 : 2]; + bmap_args.nhead_stride_k = k_strides[i_perm ? 1 : 2]; + bmap_args.batch_stride_q = q_strides[0]; + bmap_args.batch_stride_k = k_strides[0]; + bmap_args.simthreshd1 = simthreshd1; + bmap_args.cdfthreshd = (topk < 0.0f) ? cdfthreshd : -1.0f; + bmap_args.topk = topk; + bmap_args.scale = scale_s; + bmap_args.block_map_ptr = block_map_dev.GetDeviceBuffer(); + bmap_args.lut_ptr = (pipeline == "vsa") ? lut_dev.GetDeviceBuffer() : nullptr; + bmap_args.valid_block_num_ptr = (pipeline == "vsa") ? valid_bn_dev.GetDeviceBuffer() : nullptr; + + // R21A Phase 4 + R21B fix: per-head superparam buffers, all sized [nhead_q] + // to match SpargeAttn upstream contract (utils.py:324-328, Headnum=q.size(1)). + // K-side kernel reads only the first nhead_k entries via [hk]. 
+ ck_tile::DeviceMem topk_per_head_dev(static_cast(nhead) * sizeof(float)); + ck_tile::DeviceMem sim1_per_head_dev(static_cast(nhead) * sizeof(float)); + ck_tile::DeviceMem cdf_per_head_dev (static_cast(nhead) * sizeof(float)); + if(perhead != 0) + { + std::vector topk_h(nhead); + std::vector sim1_h(nhead); + std::vector cdf_h (nhead); + for(int h = 0; h < nhead; ++h) + { + // small per-head jitter around scalar topk so sparsity differs by head + const float jitter = 0.5f * (static_cast(h - nhead / 2) / nhead); + topk_h[h] = topk * (1.0f + jitter); + sim1_h[h] = simthreshd1; // bit-identical to scalar (kernel reads [0..nhead_k-1]) + cdf_h[h] = cdfthreshd; + } + topk_per_head_dev.ToDevice(topk_h.data()); + sim1_per_head_dev.ToDevice(sim1_h.data()); + cdf_per_head_dev .ToDevice(cdf_h.data()); + bmap_args.topk_per_head_ptr = static_cast(topk_per_head_dev.GetDeviceBuffer()); + bmap_args.simthreshd1_per_head_ptr = static_cast(sim1_per_head_dev.GetDeviceBuffer()); + bmap_args.cdfthreshd_per_head_ptr = static_cast(cdf_per_head_dev.GetDeviceBuffer()); + } + + // ---- build attention args ---- + ck_tile::stream_config stream_cfg; + stream_cfg.stream_id_ = nullptr; + stream_cfg.time_kernel_ = true; + stream_cfg.log_level_ = kname; + stream_cfg.cold_niters_ = warmup; + stream_cfg.nrepeat_ = repeat; + + float avg_ms = -1.0f; + + if(pipeline == "jenga") + { + fmha_jenga_fwd_traits attn_traits; + attn_traits.hdim_q = hdim_q; + attn_traits.hdim_v = hdim_v; + attn_traits.data_type = std::is_same_v ? 
"fp16" : "bf16"; + attn_traits.is_v_rowmajor = true; + attn_traits.mask_type = mask_enum::no_mask; + attn_traits.bm0 = BLKQ; + + fmha_jenga_fwd_args attn_args; + attn_args.q_ptr = q_dev.GetDeviceBuffer(); + attn_args.k_ptr = k_dev.GetDeviceBuffer(); + attn_args.v_ptr = v_dev.GetDeviceBuffer(); + attn_args.block_relation_onehot_ptr = block_map_dev.GetDeviceBuffer(); + attn_args.o_ptr = o_dev.GetDeviceBuffer(); + attn_args.seqlen_q = seqlen_q; + attn_args.seqlen_k = seqlen_k; + attn_args.batch = batch; + attn_args.max_seqlen_q = seqlen_q; + attn_args.hdim_q = hdim_q; + attn_args.hdim_v = hdim_v; + attn_args.nhead_q = nhead; + attn_args.nhead_k = nhead_k; + attn_args.scale_s = scale_s; + attn_args.stride_q = q_strides[i_perm ? 2 : 1]; + attn_args.stride_k = k_strides[i_perm ? 2 : 1]; + attn_args.stride_v = v_strides[i_perm ? 2 : 1]; + attn_args.stride_o = o_strides[o_perm ? 2 : 1]; + attn_args.nhead_stride_q = q_strides[i_perm ? 1 : 2]; + attn_args.nhead_stride_k = k_strides[i_perm ? 1 : 2]; + attn_args.nhead_stride_v = v_strides[i_perm ? 1 : 2]; + attn_args.nhead_stride_o = o_strides[o_perm ? 1 : 2]; + attn_args.batch_stride_q = q_strides[0]; + attn_args.batch_stride_k = k_strides[0]; + attn_args.batch_stride_v = v_strides[0]; + attn_args.batch_stride_o = o_strides[0]; + attn_args.window_size_left = -1; + attn_args.window_size_right = -1; + attn_args.mask_type = 0; + + avg_ms = sparge_jenga_fwd(bmap_traits, bmap_args, attn_traits, attn_args, stream_cfg); + } + else if(pipeline == "vsa") + { + fmha_vsa_fwd_traits attn_traits; + attn_traits.hdim_q = hdim_q; + attn_traits.hdim_v = hdim_v; + attn_traits.data_type = std::is_same_v ? 
"fp16" : "bf16"; + attn_traits.is_v_rowmajor = true; + attn_traits.mask_type = mask_enum::no_mask; + attn_traits.bm0 = BLKQ; + + fmha_vsa_fwd_args attn_args; + attn_args.q_ptr = q_dev.GetDeviceBuffer(); + attn_args.k_ptr = k_dev.GetDeviceBuffer(); + attn_args.v_ptr = v_dev.GetDeviceBuffer(); + attn_args.lut_ptr = lut_dev.GetDeviceBuffer(); + attn_args.valid_block_num_ptr = valid_bn_dev.GetDeviceBuffer(); + attn_args.o_ptr = o_dev.GetDeviceBuffer(); + attn_args.seqlen_q = seqlen_q; + attn_args.seqlen_k = seqlen_k; + attn_args.batch = batch; + attn_args.max_seqlen_q = seqlen_q; + attn_args.hdim_q = hdim_q; + attn_args.hdim_v = hdim_v; + attn_args.nhead_q = nhead; + attn_args.nhead_k = nhead_k; + attn_args.scale_s = scale_s; + attn_args.stride_q = q_strides[i_perm ? 2 : 1]; + attn_args.stride_k = k_strides[i_perm ? 2 : 1]; + attn_args.stride_v = v_strides[i_perm ? 2 : 1]; + attn_args.stride_o = o_strides[o_perm ? 2 : 1]; + attn_args.nhead_stride_q = q_strides[i_perm ? 1 : 2]; + attn_args.nhead_stride_k = k_strides[i_perm ? 1 : 2]; + attn_args.nhead_stride_v = v_strides[i_perm ? 1 : 2]; + attn_args.nhead_stride_o = o_strides[o_perm ? 1 : 2]; + attn_args.batch_stride_q = q_strides[0]; + attn_args.batch_stride_k = k_strides[0]; + attn_args.batch_stride_v = v_strides[0]; + attn_args.batch_stride_o = o_strides[0]; + attn_args.window_size_left = -1; + attn_args.window_size_right = -1; + attn_args.mask_type = 0; + + avg_ms = sparge_vsa_fwd_combined(bmap_traits, bmap_args, attn_traits, attn_args, stream_cfg); + } + else + { + std::cerr << "Unknown pipeline: " << pipeline << " (use jenga or vsa)\n"; + return false; + } + + // ---- TFLOPS calculation (dense FMHA formula, so sparsity gains show as higher TFLOPS) ---- + std::size_t flop = static_cast(batch) * nhead * + (static_cast(2) * seqlen_q * seqlen_k * hdim_q + + static_cast(2) * seqlen_q * seqlen_k * hdim_v); + float tflops = (avg_ms > 0.f) ? 
static_cast(flop) / 1.E9f / avg_ms : 0.f; + + if(avg_ms > 0.f) + { + std::cout << std::fixed << ", " << std::setprecision(3) << avg_ms << " ms, " + << std::setprecision(2) << tflops << " TFlops" << std::flush; + } + + // ---- copy results back ---- + o_dev.FromDevice(output_host.data()); + block_map_dev.FromDevice(block_map_host.data()); + + // ---- count active blocks ---- + ck_tile::index_t total_blocks = batch * nhead * num_q_blocks * num_k_blocks; + ck_tile::index_t active_blocks = 0; + for(size_t i = 0; i < block_map_host.mData.size(); ++i) + if(block_map_host.mData[i]) + active_blocks++; + float actual_sparsity = 1.0f - static_cast(active_blocks) / static_cast(total_blocks); + std::cout << ", sparsity=" << std::setprecision(2) << actual_sparsity + << "(" << active_blocks << "/" << total_blocks << ")" << std::flush; + + // ---- validation ---- + bool pass = true; + if(do_validation) + { + auto q_ref = to_bhsd(q_host, i_perm); + auto k_ref = to_bhsd(k_host, i_perm); + auto v_ref = to_bhsd(v_host, i_perm); + + ck_tile::HostTensor output_ref({batch, nhead, seqlen_q, hdim_v}); + ck_tile::reference_blocked_attention( + q_ref, k_ref, v_ref, block_map_host, output_ref, BLKQ, BLKK, scale_s); + + auto [rtol, atol] = get_error_tolerance(); + + float max_diff = 0.0f; + size_t num_errors = 0; + + auto output_host_bhsd = to_bhsd(output_host, o_perm); + for(size_t i = 0; i < output_host_bhsd.mData.size(); ++i) + { + float gpu_val = to_float_for_compare(output_host_bhsd.mData[i]); + float ref_val = to_float_for_compare(output_ref.mData[i]); + float diff = std::abs(gpu_val - ref_val); + float rel_diff = (std::abs(ref_val) > 1e-6f) ? diff / std::abs(ref_val) : diff; + + max_diff = std::max(max_diff, diff); + + if(diff > atol && rel_diff > rtol) + num_errors++; + } + + pass = (num_errors == 0); + std::cout << ", " << (pass ? 
"PASS" : "FAIL") + << "(err=" << num_errors << "/" << output_host_bhsd.mData.size() + << " maxdiff=" << max_diff << ")"; + } + + std::cout << std::endl; + return pass; +} + +// ============================================================================ +// Main +// ============================================================================ +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + { + std::cerr << "Failed to parse arguments\n"; + return -1; + } + + std::string prec = arg_parser.get_str("prec"); + + bool test_result = false; + if(prec == "fp16") + { + test_result = run_test(arg_parser); + } + else if(prec == "bf16") + { + test_result = run_test(arg_parser); + } + else + { + std::cerr << "Unsupported precision: " << prec << "\n"; + return -1; + } + + return test_result ? 0 : -1; +} diff --git a/include/ck_tile/ops/sparse_attn/kernel/sparge_blockmap_kernel.hpp b/include/ck_tile/ops/sparse_attn/kernel/sparge_blockmap_kernel.hpp new file mode 100644 index 00000000000..62b5b3591c0 --- /dev/null +++ b/include/ck_tile/ops/sparse_attn/kernel/sparge_blockmap_kernel.hpp @@ -0,0 +1,233 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT +#pragma once + +#include "ck_tile/core.hpp" +#include + +namespace ck_tile { + +template +struct SpargeBlockMapKernel +{ + using Pipeline = remove_cvref_t; + + static constexpr index_t kBlockSize = Pipeline::kBlockSize; + static constexpr index_t kBlockPerCu = Pipeline::kBlockPerCu; + + using QDataType = typename Pipeline::QDataType; + using KDataType = typename Pipeline::KDataType; + + static constexpr index_t kM0 = Pipeline::kM0; + static constexpr index_t kN0 = Pipeline::kN0; + static constexpr index_t D = Pipeline::D; + + static constexpr index_t kAlignment = 16 / sizeof(QDataType); + + struct Kargs + { + const void* q_ptr; + const void* k_ptr; + + index_t seqlen_q; + index_t seqlen_k; + index_t hdim_q; + + index_t nhead_q; + index_t nhead_ratio_qk; + + index_t stride_q; + index_t stride_k; + index_t nhead_stride_q; + index_t nhead_stride_k; + index_t batch_stride_q; + index_t batch_stride_k; + + float simthreshd1; + float cdfthreshd; + float topk; + float scale; + + void* block_map_ptr; + void* lut_ptr; + void* valid_block_num_ptr; + + // R20 K-stat workspace from Kernel A + const void* pooled_k_ws_ptr; // [batch, nhead_k, N_k, D] fp32 + const void* sim_k_ws_ptr; // [batch, nhead_k, N_k] uint8 + + index_t N_k; + + // R21A Phase 4: optional per-head topk (size = nhead_q floats). + // nullptr => use scalar `topk` for all heads. + const float* topk_per_head; + + // R21B: optional per-head cdfthreshd (size = nhead_q floats). + // nullptr => use scalar `cdfthreshd` for all heads. + // Only consulted on topk<=0 path; bench currently always uses topk path. 
+ const float* cdfthreshd_per_head; + }; + + CK_TILE_HOST static constexpr auto MakeKargs(const void* q_ptr, + const void* k_ptr, + index_t seqlen_q, + index_t seqlen_k, + index_t hdim_q, + index_t nhead_q, + index_t nhead_ratio_qk, + index_t stride_q, + index_t stride_k, + index_t nhead_stride_q, + index_t nhead_stride_k, + index_t batch_stride_q, + index_t batch_stride_k, + float simthreshd1, + float cdfthreshd, + float topk, + float scale, + void* block_map_ptr, + void* lut_ptr, + void* valid_block_num_ptr, + const void* pooled_k_ws_ptr, + const void* sim_k_ws_ptr, + const float* topk_per_head = nullptr, + const float* cdfthreshd_per_head = nullptr) + { + const index_t N_k = integer_divide_ceil(seqlen_k, kN0); + return Kargs{q_ptr, + k_ptr, + seqlen_q, + seqlen_k, + hdim_q, + nhead_q, + nhead_ratio_qk, + stride_q, + stride_k, + nhead_stride_q, + nhead_stride_k, + batch_stride_q, + batch_stride_k, + simthreshd1, + cdfthreshd, + topk, + scale, + block_map_ptr, + lut_ptr, + valid_block_num_ptr, + pooled_k_ws_ptr, + sim_k_ws_ptr, + N_k, + topk_per_head, + cdfthreshd_per_head}; + } + + CK_TILE_HOST static constexpr auto GridSize(index_t batch, index_t nhead_q, index_t seqlen_q) + { + const index_t Q_blk = integer_divide_ceil(seqlen_q, kM0); + return dim3(Q_blk, nhead_q, batch); + } + + CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + const index_t qb = static_cast(blockIdx.x); + const index_t hq = static_cast(blockIdx.y); + const index_t b = static_cast(blockIdx.z); + + const index_t hk = hq / kargs.nhead_ratio_qk; + + // Q pointer for this (batch, head, q_block) + const auto* q_base = reinterpret_cast(kargs.q_ptr) + + b * kargs.batch_stride_q + hq * kargs.nhead_stride_q + + qb * kM0 * kargs.stride_q; + + // K pointer for this (batch, head_k) + const auto* k_base = reinterpret_cast(kargs.k_ptr) + + b * kargs.batch_stride_k + hk * kargs.nhead_stride_k; + + // Q DRAM view with OOB 
padding + const auto q_dram_naive = make_naive_tensor_view( + q_base, + make_tuple(kargs.seqlen_q - qb * kM0, D), + make_tuple(kargs.stride_q, 1), + number{}, + number<1>{}); + const auto q_dram = pad_tensor_view( + q_dram_naive, make_tuple(number{}, number{}), sequence{}); + + auto q_window = make_tile_window(q_dram, + make_tuple(number{}, number{}), + {0, 0}, + Pipeline::MakeQBlockDistribution()); + + // K DRAM view with OOB padding + const auto k_dram_naive = + make_naive_tensor_view(k_base, + make_tuple(kargs.seqlen_k, D), + make_tuple(kargs.stride_k, 1), + number{}, + number<1>{}); + const auto k_dram = pad_tensor_view( + k_dram_naive, make_tuple(number{}, number{}), sequence{}); + + auto k_window = make_tile_window(k_dram, + make_tuple(number{}, number{}), + {0, 0}, + Pipeline::MakeKBlockDistribution()); + + // Output pointers for this (batch, head, q_block) + const index_t N_k = kargs.N_k; + const index_t bmap_offset = + (b * kargs.nhead_q + hq) * integer_divide_ceil(kargs.seqlen_q, kM0) * N_k + qb * N_k; + auto* bmap_ptr = reinterpret_cast(kargs.block_map_ptr) + bmap_offset; + + int32_t* lut_out = nullptr; + int32_t* valid_out = nullptr; + if(kargs.lut_ptr != nullptr) + { + lut_out = reinterpret_cast(kargs.lut_ptr) + bmap_offset; + const index_t valid_offset = + (b * kargs.nhead_q + hq) * integer_divide_ceil(kargs.seqlen_q, kM0) + qb; + valid_out = reinterpret_cast(kargs.valid_block_num_ptr) + valid_offset; + } + + // Shared memory + __shared__ char smem[Pipeline::GetSmemSize()]; + + // R20 K-stat workspace: pre-offset for this (b, hk). + const index_t nhead_k = kargs.nhead_q / kargs.nhead_ratio_qk; + const index_t khead_off = (b * nhead_k + hk) * N_k; + const auto* pooled_k_ws = + reinterpret_cast(kargs.pooled_k_ws_ptr) + khead_off * D; + const auto* sim_k_ws = + reinterpret_cast(kargs.sim_k_ws_ptr) + khead_off; + + // R21A Phase 4: per-head topk if provided, else scalar broadcast. + const float topk_eff = + (kargs.topk_per_head != nullptr) ? 
kargs.topk_per_head[hq] : kargs.topk; + // R21B: per-head cdfthreshd if provided, else scalar broadcast. + const float cdfthreshd_eff = + (kargs.cdfthreshd_per_head != nullptr) ? kargs.cdfthreshd_per_head[hq] : kargs.cdfthreshd; + + Pipeline{}(q_window, + k_window, + kargs.seqlen_q, + kargs.seqlen_k, + qb, + N_k, + kargs.nhead_ratio_qk, + kargs.simthreshd1, + cdfthreshd_eff, + topk_eff, + kargs.scale, + bmap_ptr, + lut_out, + valid_out, + pooled_k_ws, + sim_k_ws, + static_cast(smem)); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/sparse_attn/kernel/sparge_kstats_kernel.hpp b/include/ck_tile/ops/sparse_attn/kernel/sparge_kstats_kernel.hpp new file mode 100644 index 00000000000..3ce494f8702 --- /dev/null +++ b/include/ck_tile/ops/sparse_attn/kernel/sparge_kstats_kernel.hpp @@ -0,0 +1,136 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT +#pragma once + +#include "ck_tile/core.hpp" +#include + +namespace ck_tile { + +// Kernel A wrapper: grid (N_k, nhead_k, batch). Each work-group precomputes +// K-block stats (pooled_k_mean[D], sim_k) for one (b, hk, kb) into a workspace +// that Kernel B (block_map) reads instead of recomputing per Q-block. 
+template +struct SpargeKStatsKernel +{ + using Pipeline = remove_cvref_t; + + static constexpr index_t kBlockSize = Pipeline::kBlockSize; + static constexpr index_t kBlockPerCu = Pipeline::kBlockPerCu; + + using QDataType = typename Pipeline::QDataType; + using KDataType = typename Pipeline::KDataType; + + static constexpr index_t kN0 = Pipeline::kN0; + static constexpr index_t D = Pipeline::D; + + static constexpr index_t kAlignment = 16 / sizeof(KDataType); + + struct Kargs + { + const void* k_ptr; + + index_t seqlen_k; + index_t hdim_q; + index_t nhead_k; + + index_t stride_k; + index_t nhead_stride_k; + index_t batch_stride_k; + + float simthreshd1; + + void* pooled_k_ptr; // [batch, nhead_k, N_k, D] fp32 + void* sim_k_ptr; // [batch, nhead_k, N_k] uint8 + + index_t N_k; + + // R21A Phase 4 + R21B fix: optional per-head simthreshd1. + // Buffer is sized [nhead_q] floats to match SpargeAttn upstream contract + // (utils.py:324, Headnum=q.size(1)). Kernel only indexes the first + // nhead_k entries via [hk]. nullptr => use scalar `simthreshd1`. 
+ const float* simthreshd1_per_head; + }; + + CK_TILE_HOST static constexpr auto MakeKargs(const void* k_ptr, + index_t seqlen_k, + index_t hdim_q, + index_t nhead_k, + index_t stride_k, + index_t nhead_stride_k, + index_t batch_stride_k, + float simthreshd1, + void* pooled_k_ptr, + void* sim_k_ptr, + const float* simthreshd1_per_head = nullptr) + { + const index_t N_k = integer_divide_ceil(seqlen_k, kN0); + return Kargs{k_ptr, + seqlen_k, + hdim_q, + nhead_k, + stride_k, + nhead_stride_k, + batch_stride_k, + simthreshd1, + pooled_k_ptr, + sim_k_ptr, + N_k, + simthreshd1_per_head}; + } + + CK_TILE_HOST static constexpr auto GridSize(index_t batch, index_t nhead_k, index_t seqlen_k) + { + const index_t N_k = integer_divide_ceil(seqlen_k, kN0); + return dim3(N_k, nhead_k, batch); + } + + CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + const index_t kb = static_cast(blockIdx.x); + const index_t hk = static_cast(blockIdx.y); + const index_t b = static_cast(blockIdx.z); + + const auto* k_base = reinterpret_cast(kargs.k_ptr) + + b * kargs.batch_stride_k + hk * kargs.nhead_stride_k + + kb * kN0 * kargs.stride_k; + + const auto k_dram_naive = make_naive_tensor_view( + k_base, + make_tuple(kargs.seqlen_k - kb * kN0, D), + make_tuple(kargs.stride_k, 1), + number{}, + number<1>{}); + const auto k_dram = pad_tensor_view( + k_dram_naive, make_tuple(number{}, number{}), sequence{}); + + auto k_window = make_tile_window(k_dram, + make_tuple(number{}, number{}), + {0, 0}, + Pipeline::MakeKBlockDistribution()); + + const index_t N_k = kargs.N_k; + const index_t khead_off = (b * kargs.nhead_k + hk) * N_k; + auto* pooled_k_out = reinterpret_cast(kargs.pooled_k_ptr) + (khead_off + kb) * D; + auto* sim_k_out = reinterpret_cast(kargs.sim_k_ptr) + (khead_off + kb); + + __shared__ char smem[Pipeline::GetSmemSize()]; + + // R21A Phase 4: per-head simthreshd1 if provided, else scalar broadcast. 
+ const float simthreshd1_eff = (kargs.simthreshd1_per_head != nullptr) + ? kargs.simthreshd1_per_head[hk] + : kargs.simthreshd1; + + Pipeline{}(k_window, + kargs.seqlen_k, + kb, + simthreshd1_eff, + pooled_k_out, + sim_k_out, + static_cast(smem)); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/sparse_attn/pipeline/block_fmha_pipeline_qr_ks_vs_async_jenga.hpp b/include/ck_tile/ops/sparse_attn/pipeline/block_fmha_pipeline_qr_ks_vs_async_jenga.hpp index 67936c4353f..9fe8b365b00 100644 --- a/include/ck_tile/ops/sparse_attn/pipeline/block_fmha_pipeline_qr_ks_vs_async_jenga.hpp +++ b/include/ck_tile/ops/sparse_attn/pipeline/block_fmha_pipeline_qr_ks_vs_async_jenga.hpp @@ -318,26 +318,26 @@ struct BlockFmhaPipelineQRKSVSAsyncJenga { if(!block_relation_onehot[i_total_loops]) { - i_total_loops++; - if(i_total_loops < num_total_loop) - { - // move K tile windows - move_tile_window(k_dram_block_window, {kN0, 0}); - k_dram_window.set_window_origin(k_dram_block_window.get_window_origin()); - - if(block_relation_onehot[i_total_loops]) - { - async_load_tile_raw(k_lds_store(LdsSeq.at(number<0>{})), - k_dram_window, - number<-1>{}, - k_oob_ck, - k_pre_np); - } - move_tile_window(k_dram_window, {0, kK0}); - move_tile_window(v_dram_window, {0, kN0}); - continue; - } - break; + // scan-ahead: find the next active block in one shot + index_t next = i_total_loops + 1; + while(next < num_total_loop && !block_relation_onehot[next]) + next++; + if(next >= num_total_loop) + break; + const index_t delta = next - i_total_loops; + i_total_loops = next; + // jump K/V windows to the next active block + move_tile_window(k_dram_block_window, {kN0 * delta, 0}); + k_dram_window.set_window_origin(k_dram_block_window.get_window_origin()); + move_tile_window(v_dram_window, {0, kN0 * delta}); + // immediately prefetch the active K tile + async_load_tile_raw(k_lds_store(LdsSeq.at(number<0>{})), + k_dram_window, + number<-1>{}, + k_oob_ck, + k_pre_np); + 
move_tile_window(k_dram_window, {0, kK0}); + continue; } // STAGE 1, QK gemm diff --git a/include/ck_tile/ops/sparse_attn/pipeline/block_fmha_pipeline_qr_ks_vs_async_vsa.hpp b/include/ck_tile/ops/sparse_attn/pipeline/block_fmha_pipeline_qr_ks_vs_async_vsa.hpp index 2b097ae5827..578ad7e6039 100644 --- a/include/ck_tile/ops/sparse_attn/pipeline/block_fmha_pipeline_qr_ks_vs_async_vsa.hpp +++ b/include/ck_tile/ops/sparse_attn/pipeline/block_fmha_pipeline_qr_ks_vs_async_vsa.hpp @@ -200,7 +200,7 @@ struct BlockFmhaPipelineQRKSVSAsyncVSA constexpr auto gemm_0 = Policy::template GetQKBlockGemm(); constexpr auto gemm_1 = Policy::template GetKVBlockGemm(); - int seqlen_k_start = kv_block_idx_ptr[0] * kM0; + int seqlen_k_start = kv_block_idx_ptr[0] * kN0; auto q_dram_window = make_tile_window(q_dram_block_window_tmp.get_bottom_tensor_view(), q_dram_block_window_tmp.get_window_lengths(), q_dram_block_window_tmp.get_window_origin(), diff --git a/include/ck_tile/ops/sparse_attn/pipeline/sparge_blockmap_pipeline.hpp b/include/ck_tile/ops/sparse_attn/pipeline/sparge_blockmap_pipeline.hpp new file mode 100644 index 00000000000..25e3b964e93 --- /dev/null +++ b/include/ck_tile/ops/sparse_attn/pipeline/sparge_blockmap_pipeline.hpp @@ -0,0 +1,545 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/reduce.hpp" + +namespace ck_tile { + +template +struct SpargeBlockMapPipeline +{ + using Problem = remove_cvref_t; + using QDataType = remove_cvref_t; + using KDataType = remove_cvref_t; + using BlockFmhaShape = remove_cvref_t; + + static constexpr index_t kBlockSize = Problem::kBlockSize; + static constexpr index_t kM0 = BlockFmhaShape::kM0; + static constexpr index_t kN0 = BlockFmhaShape::kN0; + static constexpr index_t D = BlockFmhaShape::kQKHeaddim; + static constexpr index_t NumWarps = BlockFmhaShape::NumWarps; + static constexpr index_t WarpSize = get_warp_size(); + + static constexpr index_t KPerThread = 16 / sizeof(QDataType); + static constexpr index_t KThreads = D / KPerThread; + static constexpr index_t SeqThreadPerWarp = WarpSize / KThreads; + static constexpr index_t MPerThread = kM0 / (SeqThreadPerWarp * NumWarps); + static constexpr index_t NPerThread = kN0 / (SeqThreadPerWarp * NumWarps); + + static constexpr index_t kBlockPerCu = 1; + static constexpr index_t kMaxKBlocks = 1024; + + // LDS layout (non-overlapping, all used simultaneously in Phase 2): + // [0 .. kReduceBytes) cross-warp reduction scratch slab 0 + // [kReduceBytes .. 2*kReduceBytes) cross-warp reduction scratch slab 1 + // (Round 8 b1: ping-pong for K-loop double buffer) + // [kScoreOffset ..) scores[N_k] + // [kBmapOffset ..) block_map[N_k] + // [kSmallOffset ..) Phase 3 argmax scratch (2*NumWarps floats) + // B2.v3 column-stride pad: replace k_idx*KPerThread with k_idx*(KPerThread+1) + // to break the 4-way intra-warp bank conflict. New per-warp slab size: + // KThreads * (KPerThread + 1) floats. 
+ static constexpr index_t kColPaddedStride = KPerThread + 1; + static constexpr index_t kPerWarpFloats = KThreads * kColPaddedStride; + static constexpr index_t kReduceBytes = NumWarps * kPerWarpFloats * sizeof(float); + static constexpr index_t kReduceTotalBytes = 2 * kReduceBytes; // Round 8 b1: 2 slabs + static constexpr index_t kScoreOffset = kReduceTotalBytes; + static constexpr index_t kBmapOffset = kScoreOffset + kMaxKBlocks * sizeof(float); + static constexpr index_t kSmallOffset = kBmapOffset + kMaxKBlocks * sizeof(uint8_t); + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return kSmallOffset + 2 * NumWarps * sizeof(float); + } + + CK_TILE_HOST_DEVICE static constexpr auto MakeQBlockDistribution() + { + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, + sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<0, 1>>{}); + } + + CK_TILE_HOST_DEVICE static constexpr auto MakeKBlockDistribution() + { + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, + sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<0, 1>>{}); + } + + // Extract tile data into a local float array via static_for (compile-time indices). + template + CK_TILE_DEVICE static void tile_to_float(const Tile& tile, float (&out)[BufSize]) + { + static_assert(Tile::get_thread_buffer_size() == BufSize); + const auto& buf = tile.get_thread_buffer(); + static_for<0, BufSize, 1>{}([&](auto i) { out[i.value] = type_convert(buf[i]); }); + } + + // Column-wise (dim=0) sum: accumulate SeqPerThread rows into KPerThread partial sums, + // then xor-shuffle across m_idx within warp. 
+ template + CK_TILE_DEVICE static void column_reduce_thread_and_warp(const float* __restrict__ data, + float (&col_acc)[KPerThread]) + { + for(index_t k = 0; k < KPerThread; ++k) + col_acc[k] = 0.f; + + for(index_t m = 0; m < SeqPerThread; ++m) + for(index_t k = 0; k < KPerThread; ++k) + col_acc[k] += data[m * KPerThread + k]; + + for(index_t stride = KThreads; stride < WarpSize; stride *= 2) + for(index_t k = 0; k < KPerThread; ++k) + col_acc[k] += warp_shuffle(col_acc[k], __lane_id() ^ stride); + } + + // Cross-warp LDS reduction for column sums. + // Round 13f: templated TrailingSync flag. When false, the trailing __syncthreads() + // is dropped — only safe when the next access targets a *different* slab and the + // intervening work does not read smem_reduce. Used at the slab_b call in Phase 2 + // K-loop, where the next iter's first cross-warp reduce writes to slab_a (different + // address) and is preceded by its own leading sync. + template + CK_TILE_DEVICE static void column_reduce_cross_warp(float (&col_acc)[KPerThread], + float* __restrict__ smem_reduce) + { + const index_t tid = static_cast(threadIdx.x); + const index_t warp_id = tid / WarpSize; + const index_t lane_id = tid % WarpSize; + const index_t k_idx = lane_id % KThreads; + const index_t m_idx = lane_id / KThreads; + + // B2.v3 column-stride pad: stride k_idx by (KPerThread+1)=9 instead of 8, + // changing per-lane bank from (k_idx*8+k)%32 to (k_idx*9+k)%32. For k=0, + // lanes (k_idx={0,4,8,12}) now hit banks {0,4,8,12} instead of all 0. 
+ if(m_idx == 0) + for(index_t k = 0; k < KPerThread; ++k) + smem_reduce[warp_id * kPerWarpFloats + k_idx * kColPaddedStride + k] = col_acc[k]; + __syncthreads(); + + for(index_t k = 0; k < KPerThread; ++k) + col_acc[k] = 0.f; + for(index_t w = 0; w < NumWarps; ++w) + for(index_t k = 0; k < KPerThread; ++k) + col_acc[k] += smem_reduce[w * kPerWarpFloats + k_idx * kColPaddedStride + k]; + if constexpr(TrailingSync) + __syncthreads(); + } + + // Compute ||v||^2 per row: sum along KPerThread then xor-shuffle across k_idx. + template + CK_TILE_DEVICE static void row_reduce_sq_norm(const float* __restrict__ data, + float (&row_norms)[SeqPerThread], + index_t actual_seq) + { + const index_t tid = static_cast(threadIdx.x); + const index_t warp_id = tid / WarpSize; + const index_t m_idx = (tid % WarpSize) / KThreads; + + for(index_t m = 0; m < SeqPerThread; ++m) + { + float sq = 0.f; + for(index_t k = 0; k < KPerThread; ++k) + { + float v = data[m * KPerThread + k]; + sq += v * v; + } + for(index_t stride = 1; stride < KThreads; stride *= 2) + sq += warp_shuffle(sq, __lane_id() ^ stride); + + index_t gsq = m * (SeqThreadPerWarp * NumWarps) + warp_id * SeqThreadPerWarp + m_idx; + row_norms[m] = (gsq < actual_seq) ? sq : 0.f; + } + } + + // Column reduce of normalised rows: sum_hat[d] = sum_i data[i,d] / ||data[i,:]||. + template + CK_TILE_DEVICE static void column_reduce_normalised(const float* __restrict__ data, + const float* __restrict__ row_norms, + float (&col_acc)[KPerThread], + index_t actual_seq) + { + const index_t tid = static_cast(threadIdx.x); + const index_t warp_id = tid / WarpSize; + const index_t m_idx = (tid % WarpSize) / KThreads; + + for(index_t k = 0; k < KPerThread; ++k) + col_acc[k] = 0.f; + + for(index_t m = 0; m < SeqPerThread; ++m) + { + // Round 12: hardware fast rsqrt (v_rsq_f32, ~1 ULP) replaces sw sqrt+rcp. + float inv_norm = (row_norms[m] > 0.f) ? 
rsqrtf(row_norms[m]) : 0.f; + index_t gsq = m * (SeqThreadPerWarp * NumWarps) + warp_id * SeqThreadPerWarp + m_idx; + if(gsq < actual_seq) + for(index_t k = 0; k < KPerThread; ++k) + col_acc[k] += data[m * KPerThread + k] * inv_norm; + } + + for(index_t stride = KThreads; stride < WarpSize; stride *= 2) + for(index_t k = 0; k < KPerThread; ++k) + col_acc[k] += warp_shuffle(col_acc[k], __lane_id() ^ stride); + } + + // Scalar reduce across k_idx lanes (within warp). + CK_TILE_DEVICE static float reduce_across_k(float v) + { + for(index_t stride = 1; stride < KThreads; stride *= 2) + v += warp_shuffle(v, __lane_id() ^ stride); + return v; + } + + // Full-block scalar reduce (warp xor + cross-warp LDS). + CK_TILE_DEVICE static float block_reduce_sum(float v, float* smem_small) + { + const index_t tid = static_cast(threadIdx.x); + const index_t warp_id = tid / WarpSize; + const index_t lane_id = tid % WarpSize; + + for(index_t stride = 1; stride < WarpSize; stride *= 2) + v += warp_shuffle(v, __lane_id() ^ stride); + if(lane_id == 0) + smem_small[warp_id] = v; + __syncthreads(); + if(tid == 0) + { + float s = 0.f; + for(index_t w = 0; w < NumWarps; ++w) + s += smem_small[w]; + smem_small[0] = s; + } + __syncthreads(); + return smem_small[0]; + } + + CK_TILE_DEVICE static float block_reduce_max(float v, float* smem_small) + { + const index_t tid = static_cast(threadIdx.x); + const index_t warp_id = tid / WarpSize; + const index_t lane_id = tid % WarpSize; + + for(index_t stride = 1; stride < WarpSize; stride *= 2) + v = max(v, warp_shuffle(v, __lane_id() ^ stride)); + if(lane_id == 0) + smem_small[warp_id] = v; + __syncthreads(); + if(tid == 0) + { + float s = smem_small[0]; + for(index_t w = 1; w < NumWarps; ++w) + s = max(s, smem_small[w]); + smem_small[0] = s; + } + __syncthreads(); + return smem_small[0]; + } + + // ====================================================================== + template + CK_TILE_DEVICE void operator()(const QWindowType& q_window_in, + 
const KWindowType& /*k_window_in*/, + index_t seqlen_q, + index_t /*seqlen_k*/, + index_t qb, + index_t N_k, + index_t /*nhead_ratio_qk*/, + float simthreshd1, + float cdfthreshd, + float topk, + float scale, + uint8_t* block_map_ptr, + int32_t* lut_ptr, + int32_t* valid_block_num_ptr, + const float* __restrict__ pooled_k_ws_ptr, + const uint8_t* __restrict__ sim_k_ws_ptr, + void* smem_ptr) const + { + const index_t tid = static_cast(threadIdx.x); + + // R20: K-loop no longer reduces, only Phase 1 uses smem_float0. + // smem_float1 slab is allocated for layout compat but unused. + auto* smem_float0 = reinterpret_cast(smem_ptr); + auto* smem_scores = + reinterpret_cast(reinterpret_cast(smem_ptr) + kScoreOffset); + auto* smem_bmap = + reinterpret_cast(reinterpret_cast(smem_ptr) + kBmapOffset); + auto* smem_small = + reinterpret_cast(reinterpret_cast(smem_ptr) + kSmallOffset); + + const index_t bs_q = min(static_cast(kM0), seqlen_q - qb * kM0); + const float inv_bs_q = (bs_q > 0) ? (1.0f / static_cast(bs_q)) : 0.f; + + // ================================================================== + // Phase 1: Q Block Statistics + // ================================================================== + auto q_tile = load_tile(q_window_in); + + float q_data[MPerThread * KPerThread]; + tile_to_float(q_tile, q_data); + + // 1a. L2 norm per token + float psq[MPerThread]; + row_reduce_sq_norm(q_data, psq, bs_q); + + // 1b. Column sum -> mean + // Track F (re-apply R8 b2): drop trailing sync. Next reduce reuses same slab + // (smem_float0) and has its own leading __syncthreads() before reading. + // pooled_q_mean is register-only between reduces. + float pooled_q_mean[KPerThread]; + column_reduce_thread_and_warp(q_data, pooled_q_mean); + column_reduce_cross_warp(pooled_q_mean, smem_float0); + for(index_t k = 0; k < KPerThread; ++k) + pooled_q_mean[k] *= inv_bs_q; + + // 1c. Normalised sum_hat + // Track F (re-apply R8 b2): drop trailing sync. 
Next cross-warp reduce in + // K-loop iter 0 writes slab_a=smem_float0 (kb=0 even). Although same slab, + // its leading __syncthreads() covers the WAR. sum_hat register-only here. + float sum_hat[KPerThread]; + column_reduce_normalised(q_data, psq, sum_hat, bs_q); + column_reduce_cross_warp(sum_hat, smem_float0); + + // 1d. sim_q = ||sum_hat||^2 / bs_q^2 + float sh_sq = 0.f; + for(index_t k = 0; k < KPerThread; ++k) + sh_sq += sum_hat[k] * sum_hat[k]; + sh_sq = reduce_across_k(sh_sq); + const float denom_q = static_cast(bs_q) * static_cast(bs_q); + const bool sim_q = (denom_q > 0.f) && ((sh_sq / denom_q) > simthreshd1); + + // Not similar → force all K blocks ON, early exit + if(!sim_q) + { + for(index_t i = tid; i < N_k; i += kBlockSize) + block_map_ptr[i] = 1; + + if(lut_ptr != nullptr && tid == 0) + { + int32_t valid = 0, prev = 0; + for(index_t kb = 0; kb < N_k; ++kb) + { + lut_ptr[valid] = static_cast(kb) - prev; + prev = static_cast(kb); + ++valid; + } + for(index_t i = valid; i < N_k; ++i) + lut_ptr[i] = 0; + *valid_block_num_ptr = valid; + } + return; + } + + // ================================================================== + // Phase 2: K Block Loop + // ================================================================== + for(index_t i = tid; i < N_k; i += kBlockSize) + smem_bmap[i] = 0; + __syncthreads(); + + // R20: K-stats precomputed by Kernel A. Each thread loads its own + // KPerThread-slice of pooled_k_mean from DRAM workspace; sim_k is a single + // byte. No K-tile load, no cross-warp reduce in the K-loop. 
+ const index_t lane_id_kb = tid % WarpSize; + const index_t k_idx_kb = lane_id_kb % KThreads; + + for(index_t kb = 0; kb < N_k; ++kb) + { + const float* p_kb = pooled_k_ws_ptr + kb * D + k_idx_kb * KPerThread; + float pooled_k_mean[KPerThread]; + for(index_t k = 0; k < KPerThread; ++k) + pooled_k_mean[k] = p_kb[k]; + + float dot = 0.f; + for(index_t k = 0; k < KPerThread; ++k) + dot += pooled_q_mean[k] * pooled_k_mean[k]; + dot = reduce_across_k(dot); + + const bool sim_k = (sim_k_ws_ptr[kb] != 0); + + if(tid == 0) + { + // INVARIANT (mirrors SpargeAttn ref utils.py:175-180): + // ~sim_k blocks are forced ON in the bitmap (final_map[~sim_k]=1) + // AND have score = -inf so Phase 3 selection (topk / cdf) does NOT + // pick them again (would double-count toward topk budget). + // Both writes MUST stay together. Any Phase 3 selection rewrite + // (e.g. iterative argmax → bitonic sort) must keep the -inf write. + if(!sim_k) + { + smem_bmap[kb] = 1; + smem_scores[kb] = -numeric::infinity(); + } + else + { + smem_scores[kb] = dot * scale; + } + } + } + __syncthreads(); // guard Phase 3's reads of smem_bmap / smem_scores + + // ================================================================== + // Phase 3: Softmax + Selection + // ================================================================== + + // max + float lmax = -numeric::infinity(); + for(index_t i = tid; i < N_k; i += kBlockSize) + lmax = max(lmax, smem_scores[i]); + const float max_score = block_reduce_max(lmax, smem_small); + + // exp + sum + float lsum = 0.f; + for(index_t i = tid; i < N_k; i += kBlockSize) + { + float e = (smem_scores[i] > -numeric::infinity()) + ? __builtin_expf(smem_scores[i] - max_score) + : 0.f; + smem_scores[i] = e; + lsum += e; + } + const float sum_exp = block_reduce_sum(lsum, smem_small); + + // Round 13i: argmax is invariant under positive scaling (inv_sum > 0). 
When + // topk > 0 we never read normalised values for cdfthreshd, so skip the + // normalise pass entirely (saves N_k LDS writes + 1 __syncthreads). The + // cdfthreshd path (topk <= 0) still requires normalised scores so the + // accumulator `cumulative_prob` matches probabilities. + const bool topk_active = (topk > 0.f); + const float inv_sum = + (!topk_active && sum_exp > 0.f) ? (1.0f / sum_exp) : 0.f; + if(!topk_active) + { + for(index_t i = tid; i < N_k; i += kBlockSize) + smem_scores[i] *= inv_sum; + __syncthreads(); + } + + // Selection: iterative argmax + index_t num_to_select = + topk_active + ? max(static_cast(1), static_cast(topk * static_cast(N_k))) + : N_k; + + float cumulative_prob = 0.f; + for(index_t round = 0; round < num_to_select; ++round) + { + // thread-local argmax + float best_val = -1.f; + index_t best_idx = 0; + for(index_t i = tid; i < N_k; i += kBlockSize) + { + if(smem_scores[i] > best_val || (smem_scores[i] == best_val && i < best_idx)) + { + best_val = smem_scores[i]; + best_idx = i; + } + } + + // warp argmax + for(index_t stride = 1; stride < WarpSize; stride *= 2) + { + float rv = warp_shuffle(best_val, __lane_id() ^ stride); + index_t ri = warp_shuffle(best_idx, __lane_id() ^ stride); + if(rv > best_val || (rv == best_val && ri < best_idx)) + { + best_val = rv; + best_idx = ri; + } + } + + // cross-warp argmax via LDS + const index_t lane_id = tid % WarpSize; + const index_t warp_id = tid / WarpSize; + if(lane_id == 0) + { + smem_small[warp_id] = best_val; + smem_small[NumWarps + warp_id] = bit_cast(static_cast(best_idx)); + } + __syncthreads(); + + // Round 13g: collapse 2 syncs/round into 1. tid==0 computes the global + // winner AND writes the sentinel (smem_bmap=1, smem_scores=-1) in the same + // critical section, gated by bv>0. All threads then read smem_small[0] for + // the early break / cumulative_prob accumulation. Saves 1 __syncthreads per + // round (~32 syncs @ N_k=64 topk=0.5). 
+ if(tid == 0) + { + float bv = smem_small[0]; + index_t bi = bit_cast(smem_small[NumWarps]); + for(index_t w = 1; w < NumWarps; ++w) + { + float wv = smem_small[w]; + index_t wi = bit_cast(smem_small[NumWarps + w]); + if(wv > bv || (wv == bv && wi < bi)) + { + bv = wv; + bi = wi; + } + } + // Write sentinel into bmap/scores in the same critical section. + // Guarded by bv > 0 so we never poison a valid score with -1. + if(bv > 0.f) + { + smem_bmap[bi] = 1; + smem_scores[bi] = -1.f; + } + smem_small[0] = bv; + } + __syncthreads(); + + float g_val = smem_small[0]; + + if(g_val <= 0.f) + break; + + if(topk > 0.f) + { + if(round + 1 >= num_to_select) + break; + } + else + { + cumulative_prob += g_val; + if(cumulative_prob >= cdfthreshd) + break; + } + } + + // ================================================================== + // Write outputs to global memory + // ================================================================== + for(index_t i = tid; i < N_k; i += kBlockSize) + block_map_ptr[i] = smem_bmap[i]; + + if(lut_ptr != nullptr && tid == 0) + { + int32_t valid = 0, prev = 0; + for(index_t kb = 0; kb < N_k; ++kb) + { + if(smem_bmap[kb] != 0) + { + lut_ptr[valid] = static_cast(kb) - prev; + prev = static_cast(kb); + ++valid; + } + } + for(index_t i = valid; i < N_k; ++i) + lut_ptr[i] = 0; + *valid_block_num_ptr = valid; + } + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/sparse_attn/pipeline/sparge_kstats_pipeline.hpp b/include/ck_tile/ops/sparse_attn/pipeline/sparge_kstats_pipeline.hpp new file mode 100644 index 00000000000..1cb96d716a3 --- /dev/null +++ b/include/ck_tile/ops/sparse_attn/pipeline/sparge_kstats_pipeline.hpp @@ -0,0 +1,110 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/sparse_attn/pipeline/sparge_blockmap_pipeline.hpp" + +namespace ck_tile { + +// Kernel A of the K-stat precompute split: one work-group per (b, hk, kb) +// computes pooled_k_mean and sim_k for that K-block once. Kernel B then reads +// from the workspace instead of recomputing per Q-block. +template +struct SpargeKStatsPipeline +{ + using Problem = remove_cvref_t; + using Base = SpargeBlockMapPipeline; + using QDataType = typename Base::QDataType; + using KDataType = typename Base::KDataType; + + static constexpr index_t kBlockSize = Base::kBlockSize; + static constexpr index_t kM0 = Base::kM0; + static constexpr index_t kN0 = Base::kN0; + static constexpr index_t D = Base::D; + static constexpr index_t NumWarps = Base::NumWarps; + static constexpr index_t WarpSize = Base::WarpSize; + + static constexpr index_t KPerThread = Base::KPerThread; + static constexpr index_t KThreads = Base::KThreads; + static constexpr index_t SeqThreadPerWarp = Base::SeqThreadPerWarp; + static constexpr index_t NPerThread = Base::NPerThread; + + static constexpr index_t kBlockPerCu = 1; + + static constexpr index_t kColPaddedStride = Base::kColPaddedStride; + static constexpr index_t kPerWarpFloats = Base::kPerWarpFloats; + static constexpr index_t kReduceBytes = NumWarps * kPerWarpFloats * sizeof(float); + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return kReduceBytes; } + + CK_TILE_HOST_DEVICE static constexpr auto MakeKBlockDistribution() + { + return Base::MakeKBlockDistribution(); + } + + // operator(): one work-group, one K-block. Writes D fp32 + 1 uint8 to workspace. 
+ template + CK_TILE_DEVICE void operator()(const KWindowType& k_window, + index_t seqlen_k, + index_t kb, + float simthreshd1, + float* __restrict__ pooled_k_out, // D floats + uint8_t* __restrict__ sim_k_out, // 1 byte + void* smem_ptr) const + { + const index_t tid = static_cast(threadIdx.x); + auto* smem_reduce = reinterpret_cast(smem_ptr); + + const index_t bs_k = min(static_cast(kN0), seqlen_k - kb * kN0); + const float inv_bs_k = (bs_k > 0) ? (1.0f / static_cast(bs_k)) : 0.f; + + auto k_tile = load_tile(k_window); + + float k_data[NPerThread * KPerThread]; + Base::template tile_to_float(k_tile, k_data); + + const index_t warp_id = tid / WarpSize; + const index_t lane_id = tid % WarpSize; + const index_t k_idx = lane_id % KThreads; + const index_t m_idx = lane_id / KThreads; + + // pooled_k_mean: column sum then cross-warp reduce. + // R21A: drop trailing sync (next cross_warp_reduce has its own leading sync). + float pooled_k_mean[KPerThread]; + Base::template column_reduce_thread_and_warp(k_data, pooled_k_mean); + Base::template column_reduce_cross_warp(pooled_k_mean, smem_reduce); + for(index_t k = 0; k < KPerThread; ++k) + pooled_k_mean[k] *= inv_bs_k; + + // R21A: write pooled_k_mean to global early so its register liveness ends here, + // freeing VGPR before k_sum_hat becomes live. + if(warp_id == 0 && m_idx == 0) + { + for(index_t k = 0; k < KPerThread; ++k) + pooled_k_out[k_idx * KPerThread + k] = pooled_k_mean[k]; + } + + // K row L2 norms + normalised column sum (k_sum_hat) + float k_psq[NPerThread]; + Base::template row_reduce_sq_norm(k_data, k_psq, bs_k); + + float k_sum_hat[KPerThread]; + Base::template column_reduce_normalised(k_data, k_psq, k_sum_hat, bs_k); + // R21A: drop trailing sync (no further smem read; only intra-warp shuffle + global write). 
+ Base::template column_reduce_cross_warp(k_sum_hat, smem_reduce); + + // sim_k = (||k_sum_hat||^2 / bs_k^2) > simthreshd1 + float ksh_sq = 0.f; + for(index_t k = 0; k < KPerThread; ++k) + ksh_sq += k_sum_hat[k] * k_sum_hat[k]; + ksh_sq = Base::reduce_across_k(ksh_sq); + const float denom_k = static_cast(bs_k) * static_cast(bs_k); + const bool sim_k = (denom_k > 0.f) && ((ksh_sq / denom_k) > simthreshd1); + + if(tid == 0) + *sim_k_out = sim_k ? static_cast(1) : static_cast(0); + } +}; + +} // namespace ck_tile