From 118b6f05745a263c29140bbf64282d1c7e9d8983 Mon Sep 17 00:00:00 2001
From: dongfengy <99041270+dongfengy@users.noreply.github.com>
Date: Tue, 26 May 2026 12:37:11 -0700
Subject: [PATCH 1/2] [https://nvbugs/6168859][fix] move tinygemm PDL release
 after reduction (#14537)

Signed-off-by: Dongfeng Yu <dongfengy@nvidia.com>
---
 .../kernels/tinygemm2/tinygemm2_kernel.cuh    |   6 +-
 tests/integration/test_lists/waives.txt       | 786 +++++++++---------
 2 files changed, 402 insertions(+), 390 deletions(-)

diff --git a/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh b/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh
index ca95f6849bc..4592fe1d110 100644
--- a/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh
+++ b/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh
@@ -236,7 +236,6 @@ __global__ __launch_bounds__(384, 1) void tinygemm_kernel(__nv_bfloat16* output,
         if (!weight_warp)
         {
             cudaGridDependencySynchronize();
-            cudaTriggerProgrammaticLaunchCompletion();
         }
 
         for (int ki = 0; ki < K_LOOPS_DMA; ki++)
@@ -422,6 +421,11 @@ __global__ __launch_bounds__(384, 1) void tinygemm_kernel(__nv_bfloat16* output,
 
         __syncthreads();
 
+        if (threadIdx.x == 0) // one thread per block suffices according to official code examples
+        {
+            cudaTriggerProgrammaticLaunchCompletion();
+        }
+
         if (warp_id == 0)
         {
 
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 686b6211843..b26944b57de 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -1,413 +1,421 @@
-examples/test_openai.py::test_llm_openai_triton_1gpu SKIP (https://nvbugspro.nvidia.com/bug/4963654)
-examples/test_openai.py::test_llm_openai_triton_plugingen_1gpu SKIP (https://nvbugspro.nvidia.com/bug/4963654)
-full:GH200/examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (arm is not supported)
-full:GH200/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (arm is not supported)
-perf/test_perf.py::test_perf[t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449)
-perf/test_perf.py::test_perf[flan_t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449)
-perf/test_perf.py::test_perf[bart_large_cnn-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449)
-full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/4731514)
-perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:512,200-quant:fp8-tp:4] SKIP (SKIP due to timeout of quantization)
-perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-quant:fp8-gpus:8] SKIP (SKIP due to timeout of quantization)
+accuracy/test_cli_flow.py::TestGptNext::test_auto_dtype SKIP (https://nvbugs/6162940)
+accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/6120535)
+accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp1cp4] SKIP (https://nvbugs/6189918)
+accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2] SKIP (https://nvbugs/6189918)
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_kv_cache_v2_nixl_python SKIP (https://nvbugs/6184575)
+accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False] SKIP (https://nvbugs/6117811)
+accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True] SKIP (https://nvbugs/6117811)
+accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_kv_cache_v2_nixl_python SKIP (https://nvbugs/6117811)
+accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664)
+accuracy/test_dwdp_disaggregated_serving.py::TestDwdpDeepSeekV3Lite::test_dwdp_accuracy SKIP (https://nvbugs/6094102)
+accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph SKIP (https://nvbugs/5772995)
+accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] SKIP (https://nvbugs/5346443)
+accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8 SKIP (https://nvbugs/5413197)
+accuracy/test_llm_api_autodeploy.py::TestGemma4MoE::test_bf16 SKIP (https://nvbugs/6158397)
+accuracy/test_llm_api_autodeploy.py::TestGemmaE2B::test_gemma4_e2b_it SKIP (https://nvbugs/6194934)
+accuracy/test_llm_api_autodeploy.py::TestMiniMaxM2::test_finegrained_fp8 SKIP (https://nvbugs/6158397)
+accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[nvfp4-1-trtllm] SKIP (https://nvbugs/6200112)
+accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_functional_small[bf16] SKIP (https://nvbugs/6162114)
+accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_functional_small[fp8] SKIP (https://nvbugs/6162114)
+accuracy/test_llm_api_autodeploy.py::TestNemotronUltraV3::test_accuracy[nvfp4-8] SKIP (https://nvbugs/6215690)
+accuracy/test_llm_api_autodeploy.py::TestQwen3_5_397B_MoE::test_bf16_small[4] SKIP (https://nvbugs/6158397)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/6215736)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] SKIP (https://nvbugs/6084775)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/6029882)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/6018046)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4] SKIP (https://nvbugs/6215793)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus SKIP (https://nvbugs/6193778)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_dsa_host_cache_offload[host_cache_offload] SKIP (https://nvbugs/6185196)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_dsa_host_cache_offload[host_cache_offload_mtp1] SKIP (https://nvbugs/6185196)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_dsa_host_cache_offload[host_cache_offload_mtp3_no_adp] SKIP (https://nvbugs/6185196)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline] SKIP (https://nvbugs/6185196)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] SKIP (https://nvbugs/5955792)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[disable_skip_indexer] SKIP (https://nvbugs/5859886)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency_default] SKIP (https://nvbugs/6185196)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline] SKIP (https://nvbugs/6185196)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_mtp1] SKIP (https://nvbugs/6185196)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_piecewise_cuda_graph[baseline] SKIP (https://nvbugs/6185196)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_piecewise_cuda_graph[mtp3_fp8kv_chunked] SKIP (https://nvbugs/5989920)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6084720)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6095851)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/6162860)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6050489)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6162115)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6050489)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6050489)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/6162115)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/6162115)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6198785)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_python_scheduler[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-enable_chunked_prefill=True] SKIP (https://nvbugs/6071081)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_cute_dsl_nvfp4_4gpus[tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6185146)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6195110)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6162115)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] SKIP (https://nvbugs/6162115)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6112497)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6162122)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6084447)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True-sampler_async_worker=False] SKIP (https://nvbugs/6084447)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6084447)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=True] SKIP (https://nvbugs/6084447)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] SKIP (https://nvbugs/6084447)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=True] SKIP (https://nvbugs/6084447)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6084447)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=0] SKIP (https://nvbugs/6084447)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=2] SKIP (https://nvbugs/6084447)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=0] SKIP (https://nvbugs/6162122)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=2] SKIP (https://nvbugs/6162122)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=0] SKIP (https://nvbugs/6162122)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/6162122)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] SKIP (https://nvbugs/5955773)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6162122)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5945081)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6162115)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6162115)
+accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype SKIP (https://nvbugs/6164924)
+accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_2_model_mtp[2model] SKIP (https://nvbugs/5981293)
+accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_2_model_mtp[2model_trtllm] SKIP (https://nvbugs/5981293)
+accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_multi_gpus[throughput] SKIP (https://nvbugs/5981293)
+accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_multi_gpus[throughput_trtllm] SKIP (https://nvbugs/5981293)
+accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model] SKIP (https://nvbugs/5772993)
+accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm] SKIP (https://nvbugs/5772360)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_dflash SKIP (https://nvbugs/6156233)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v1_kv_cache-trtllm-one_model-no_overlap_scheduler] SKIP (https://nvbugs/6220815)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v2_kv_cache-cutlass-two_model-no_overlap_scheduler] SKIP (https://nvbugs/6211880)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v2_kv_cache-trtllm-one_model-overlap_scheduler] SKIP (https://nvbugs/6215702)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model] SKIP (https://nvbugs/5596343)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model] SKIP (https://nvbugs/5596343)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-triton-auto] SKIP (https://nvbugs/6026676)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/6153955)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-cutlass-auto] SKIP (https://nvbugs/5596343)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-auto] SKIP (https://nvbugs/5596343)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-fp8] SKIP (https://nvbugs/5651865)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/5596343)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/6109750)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-cutlass-auto] SKIP (https://nvbugs/5596343)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-tp4-cutlass-auto] SKIP (https://nvbugs/5596343)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto] SKIP (https://nvbugs/5596343)
+accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype SKIP (https://nvbugs/6209806)
+accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4_longseq_trtllm_moe_async_cancel SKIP (https://nvbugs/6160085)
+accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4_longseq_trtllm_moe_stress SKIP (https://nvbugs/6160085)
+accuracy/test_llm_api_pytorch.py::TestLagunaXS::test_bf16 SKIP (https://nvbugs/6211185)
+accuracy/test_llm_api_pytorch.py::TestLagunaXS::test_fp8 SKIP (https://nvbugs/6211185)
+accuracy/test_llm_api_pytorch.py::TestLagunaXS::test_nvfp4 SKIP (https://nvbugs/6211185)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5616182)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[use_temperature=False-attn_backend=TRTLLM] SKIP (https://nvbugs/5997547)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_dflash SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/6050489)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler SKIP (https://nvbugs/6112497)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance] SKIP (https://nvbugs/6076767)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[llguidance-eagle3_one_model=False] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[llguidance-eagle3_one_model=True] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[xgrammar-eagle3_one_model=False] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[xgrammar-eagle3_one_model=True] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_ngram[llguidance] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_ngram[xgrammar] SKIP (https://nvbugs/6141653)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_pard[overlap_scheduler=False] SKIP (https://nvbugs/6162114)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_pard[overlap_scheduler=True] SKIP (https://nvbugs/6162114)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_pard_sa SKIP (https://nvbugs/6162114)
+accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype SKIP (https://nvbugs/5520319)
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=False-enable_gemm_allreduce_fusion=False] SKIP (https://nvbugs/6162122)
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=False-enable_gemm_allreduce_fusion=True] SKIP (https://nvbugs/6162122)
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True] SKIP (https://nvbugs/5775326)
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=True] SKIP (https://nvbugs/5821415)
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=True] SKIP (https://nvbugs/5821415)
+accuracy/test_llm_api_pytorch.py::TestMiniMaxM2::test_4gpus[attention_dp=False-cuda_graph=True-overlap_scheduler=True-tp_size=4-ep_size=4] SKIP (https://nvbugs/6159132)
+accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_fp8[latency_moe_deepgemm] SKIP (https://nvbugs/6163033)
+accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm] SKIP (https://nvbugs/6162121)
+accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm_eagle] SKIP (https://nvbugs/6157892)
+accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus_mtp SKIP (https://nvbugs/6211693)
+accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype SKIP (https://nvbugs/6076767)
+accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap_adp_on] SKIP (https://nvbugs/6094068)
+accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass] SKIP (https://nvbugs/6116088)
+accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4_adp_on-trtllm] SKIP (https://nvbugs/6094068)
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency] SKIP (https://nvbugs/6177390)
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] SKIP (https://nvbugs/6177390)
+accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8[enable_block_reuse=False] SKIP (https://nvbugs/6212252)
+accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8[enable_block_reuse=True] SKIP (https://nvbugs/6210714)
+accuracy/test_llm_api_pytorch.py::TestQwen3_5_9B::test_bf16[mtp_off] SKIP (https://nvbugs/6212250)
+accuracy/test_llm_api_pytorch.py::TestQwen3_5_9B::test_bf16[mtp_on] SKIP (https://nvbugs/6212250)
+accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales_early_first_token_response SKIP (https://nvbugs/6200128)
+accuracy/test_llm_api_pytorch_multimodal.py::TestExaone4_5_33B::test_auto_dtype[forced_chunked_prefill] SKIP (https://nvbugs/6211189)
+accuracy/test_llm_api_pytorch_multimodal.py::TestExaone4_5_33B::test_auto_dtype[full_budget] SKIP (https://nvbugs/6211189)
+accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized SKIP (https://nvbugs/6189416)
+accuracy/test_llm_api_pytorch_multimodal.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm] SKIP (https://nvbugs/6181383)
+accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL::test_auto_dtype[forced_chunked_prefill] SKIP (https://nvbugs/6143787)
+accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype SKIP (https://nvbugs/6114464)
+accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray SKIP (https://nvbugs/6094070)
+cpp/test_e2e.py::test_benchmarks[bart-90] SKIP (https://nvbugs/5550689)
+cpp/test_e2e.py::test_benchmarks[gpt-80] SKIP (https://nvbugs/5550689)
+cpp/test_e2e.py::test_model[-bart-90] SKIP (https://nvbugs/6162804)
 cpp/test_e2e.py::test_model[-encoder-90] SKIP (waive Encoder-only test because it doesn't take batched input)
-full:GH200/unittest/trt/model_api/test_model_quantization.py SKIP (https://nvbugspro.nvidia.com/bug/4979955)
+cpp/test_e2e.py::test_model[-gpt-80] SKIP (https://nvbugs/5983283)
+cpp/test_e2e.py::test_model[-mamba-86] SKIP (https://nvbugs/5781665)
+cpp/test_e2e.py::test_model[-redrafter-86] SKIP (https://nvbugs/5761642)
+cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941)
+cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-nixl_kvcache-90] SKIP (https://nvbugs/6093820)
+cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-ucx_kvcache-90] SKIP (https://nvbugs/6093820)
+cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5838199)
+disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
+disaggregated/test_disaggregated.py::test_disaggregated_cancel_large_context_requests[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6105768)
+disaggregated/test_disaggregated.py::test_disaggregated_chat_completion_tool_calls[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
+disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2_genpp2[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6114140)
+disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6184906)
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_cache_aware_balance[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6162322)
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_conditional[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6162322)
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162324)
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_gen_only[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322)
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322)
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322)
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322)
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap_cuda_graph[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322)
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxpp2_gentp2_one_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322)
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxtp2ep2pp2_gentp4_one_mtp_block_reuse[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322)
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxtp2ep2pp2_gentp4_one_mtp_block_reuse_long_prompt[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322)
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_overlap_cuda_graph[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322)
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_attention_dp_overlap_one_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322)
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322)
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322)
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_two_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322)
+disaggregated/test_disaggregated.py::test_disaggregated_diff_max_tokens[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
+disaggregated/test_disaggregated.py::test_disaggregated_genbs1[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6162322)
+disaggregated/test_disaggregated.py::test_disaggregated_kv_cache_time_output[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6184906)
+disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
+disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
+disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
+disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_trt_backend[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
+disaggregated/test_disaggregated.py::test_disaggregated_perf_metrics[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
+disaggregated/test_disaggregated.py::test_disaggregated_single_gpu[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6184906)
+disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_trt_backend[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
+disaggregated/test_workers.py::test_workers_conditional_disaggregation_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6094100)
+disaggregated/test_workers.py::test_workers_conversation_router[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6162322)
+disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
+disaggregated/test_workers.py::test_workers_kv_cache_aware_router_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6162322)
+disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6162322)
+disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6114139)
+examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
+examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] SKIP (https://nvbugs/5234058)
+examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5234058)
+examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1] SKIP (https://nvbugs/5546507)
+examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] SKIP (https://nvbugs/5546507)
+examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8] SKIP (https://nvbugs/5546507)
+examples/test_gpt.py::test_llm_minitron_fp8_with_pseudo_loras[4b] SKIP (https://nvbugs/5606233)
+examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] SKIP (https://nvbugs/5431132)
+examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16] SKIP (https://nvbugs/5608979)
+examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] SKIP (https://nvbugs/5608979)
+examples/test_llama.py::test_llm_llama_1gpu_fp8_kv_cache[llama-v2-7b-hf-bfloat16] SKIP (https://nvbugs/5527940)
+examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5802248)
+examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5333849)
+examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5222697)
+examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
+examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
+examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
 examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5014327)
-examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/4866931)
 examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (https://nvbugs/4961624)
-full:sm100/unittest/trt/functional SKIP (Disable for Blackwell)
-full:sm100/unittest/trt/quantization SKIP (Disable for Blackwell)
-full:sm100/unittest/trt/attention/test_bert_attention.py SKIP (Disable for Blackwell)
-full:sm100/unittest/trt/model/test_mamba.py SKIP (Disable for Blackwell)
-full:sm100/unittest/bindings SKIP (Disable for Blackwell)
-full:sm100/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
-full:sm100/unittest/trt/quantization/test_weight_only_quant_matmul.py SKIP (Disable for Blackwell)
-full:sm100/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py SKIP (Disable for Blackwell)
-full:sm100/unittest/trt/model/test_gpt.py -k "partition0" SKIP (Disable for Blackwell)
-full:sm100/unittest/test_model_runner_cpp.py SKIP (Disable for Blackwell)
-full:sm100/unittest/llmapi/test_llm_models.py -m "part0" SKIP (Disable for Blackwell for context fmha doesn't support when headsize is 80/96)
-full:sm100/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (megatron-core 0.8 is not supported in python 3.12)
-full:sm100/unittest/llmapi/test_llm_models.py -m "not (part0 or part1)" SKIP (Disable for Blackwell OOM)
-full:sm100/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (megatron-core 0.8 is not supported in python 3.12)
-full:B200/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugspro.nvidia.com/bug/5161074)
-full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugspro.nvidia.com/bug/5161074)
-full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugspro.nvidia.com/bug/5161074)
-full:B200/perf/test_perf.py::test_perf[bart_large_cnn] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[bert_large] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[flan_t5_base] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[flan_t5_large] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[flan_t5_xl] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[flan_t5_xxl] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[mbart_large_50_many_to_one_mmt] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[roberta_base] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[t5_11b] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[t5_3b] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[t5_base] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[t5_large] SKIP (bert_attention_plugin does not support SM >= 100)
-full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugspro.nvidia.com/bug/5161074)
-full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugspro.nvidia.com/bug/5161074)
-full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugspro.nvidia.com/bug/5161074)
+examples/test_nemotron_nas.py::test_nemotron_nas_summary_1gpu[DeciLM-7B] SKIP (https://nvbugs/5444636)
+examples/test_nemotron_nas.py::test_nemotron_nas_summary_2gpu[DeciLM-7B] SKIP (https://nvbugs/5444636)
+examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16] SKIP (https://nvbugs/5612313)
+examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (https://nvbugs/5447530)
+examples/test_ray.py::test_ray_disaggregated_serving[tp2] SKIP (https://nvbugs/5612502)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5174573)
+examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-fp8-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5419070)
+examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5401233)
+examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int8_sq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5232405)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-disable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
-examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5222697)
-examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16] SKIP (https://nvbugspro.nvidia.com/bug/5226339)
-perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20] SKIP # https://nvbugspro.nvidia.com/bug/5207477
+examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b] SKIP (https://nvbugs/5401233)
+examples/test_visual_gen.py::test_flux1_lpips_against_golden SKIP (https://nvbugs/6215688)
+examples/test_visual_gen.py::test_flux2_lpips_against_golden SKIP (https://nvbugs/6215688)
+examples/test_visual_gen.py::test_ltx2_lpips_against_golden SKIP (https://nvbugs/6215688)
+examples/test_visual_gen.py::test_wan21_t2v_lpips_against_golden SKIP (https://nvbugs/6215688)
+examples/test_visual_gen.py::test_wan22_t2v_lpips_against_golden SKIP (https://nvbugs/6215688)
+examples/test_visual_gen.py::test_wan_t2v_example SKIP (https://nvbugs/6215688)
+examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-disable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/5244570)
+full:A10/unittest/kv_cache_manager_v2_tests/ SKIP (https://nvbugs/5841954)
+full:A100/accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[trtllm-False] SKIP (https://nvbugs/6185480)
+full:A100/accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[trtllm-True] SKIP (https://nvbugs/6185480)
+full:B200/accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-4-trtllm] SKIP (https://nvbugs/6185150)
+full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugs/5161074)
+full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugs/5161074)
+full:B200/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugs/5161074)
+full:B300/unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "TRTLLM" SKIP (https://nvbugs/6165866)
+full:DGX_H100/kv_cache/test_prefix_aware_scheduling.py::TestServePrefixAwareScheduling::test_multi_round_qa_shared_prefix[swa-chunked] SKIP (https://nvbugs/6136737)
+full:GB200-OCI/accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-4-trtllm] SKIP (https://nvbugs/6185150)
+full:GB200/perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6194788)
+full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/4731514)
+full:GH200/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (arm is not supported)
+full:GH200/examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (arm is not supported)
+full:GH200/unittest/trt/model_api/test_model_quantization.py SKIP (https://nvbugs/4979955)
+full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache SKIP (https://nvbugs/5682551)
+full:H20/accuracy/test_llm_api_autodeploy.py::TestNemotronV2::test_auto_dtype[False] SKIP (https://nvbugs/6185173)
+full:H20/accuracy/test_llm_api_autodeploy.py::TestNemotronV2::test_auto_dtype[True] SKIP (https://nvbugs/6185173)
+full:H20/accuracy/test_llm_api_autodeploy.py::TestNemotronV2::test_fp8[True] SKIP (https://nvbugs/6185173)
+full:H20/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto] SKIP (https://nvbugs/6026676)
+full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5569696)
+full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5948435)
+full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/5961814)
+full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5961814)
+full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] SKIP (https://nvbugs/5929339)
+full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] SKIP (https://nvbugs/6128419)
+full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugs/5161074)
+full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugs/5161074)
+full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugs/5161074)
+full:sm100/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (megatron-core 0.8 is not supported in python 3.12)
+full:sm100/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (megatron-core 0.8 is not supported in python 3.12)
+full:sm100/unittest/bindings SKIP (Disable for Blackwell)
+full:sm100/unittest/llmapi/test_llm_models.py -m "not (part0 or part1)" SKIP (Disable for Blackwell OOM)
+full:sm100/unittest/llmapi/test_llm_models.py -m "part0" SKIP (Disable for Blackwell for context fmha doesn't support when headsize is 80/96)
+full:sm100/unittest/test_model_runner_cpp.py SKIP (Disable for Blackwell)
+full:sm100/unittest/trt/attention/test_bert_attention.py SKIP (Disable for Blackwell)
+full:sm100/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
+full:sm100/unittest/trt/functional SKIP (Disable for Blackwell)
+full:sm100/unittest/trt/model/test_gpt.py -k "partition0" SKIP (Disable for Blackwell)
+full:sm100/unittest/trt/model/test_mamba.py SKIP (Disable for Blackwell)
+full:sm100/unittest/trt/quantization SKIP (Disable for Blackwell)
+full:sm100/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py SKIP (Disable for Blackwell)
+full:sm100/unittest/trt/quantization/test_weight_only_quant_matmul.py SKIP (Disable for Blackwell)
+llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_eagle3 SKIP (https://nvbugs/6075431)
+llmapi/test_llm_examples.py::test_llmapi_tensorrt_engine SKIP (https://nvbugs/5820553)
+perf/test_perf.py::test_perf[bart_large_cnn-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449)
 perf/test_perf.py::test_perf[flan_t5_base-bench-float16-input_output_len:128,20] SKIP
-perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20] SKIP
+perf/test_perf.py::test_perf[flan_t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449)
 perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20-gpus:2] SKIP
+perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20] SKIP
 perf/test_perf.py::test_perf[flan_t5_large-bench-float16-maxbs:1-input_output_len:128,20-gpus:2] SKIP
-perf/test_perf.py::test_perf[whisper_large_v3-bench-float16-input_output_len:128,20] SKIP
-perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:128,128] SKIP
-perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:512,32] SKIP
+perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8] SKIP
+perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8] SKIP
+perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-quant:fp8-gpus:8] SKIP (SKIP due to timeout of quantization)
+perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:512,200-quant:fp8-tp:4] SKIP (SKIP due to timeout of quantization)
 perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:128,128] SKIP
 perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:512,32] SKIP
+perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:128,128] SKIP
+perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:512,32] SKIP
 perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20-gpus:2] SKIP
 perf/test_perf.py::test_perf[t5-bench-float16-maxbs:1-input_output_len:128,20-gpus:2] SKIP
-perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8] SKIP
-perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8] SKIP
-full:NVIDIA_B200/perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8] SKIP (https://nvbugspro.nvidia.com/bug/5150255)
-full:NVIDIA_B200/perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] SKIP (https://nvbugspro.nvidia.com/bug/5150255)
-full:NVIDIA_B200/perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] SKIP (https://nvbugspro.nvidia.com/bug/5150255)
-full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:128,128] SKIP (https://nvbugspro.nvidia.com/bug/5150255)
-full:B200/perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float16-input_output_len:128,128-quant:fp8] SKIP (https://nvbugspro.nvidia.com/bug/5150255)
-examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int8_sq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5232405)
-examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
-examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
-examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] SKIP (https://nvbugs/5234058)
-examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5234058)
-examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-disable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/5244570)
-triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble] SKIP (https://nvbugs/5240060)
-triton_server/test_triton.py::test_triton_extensive[triton-extensive] SKIP
-triton_server/test_triton.py::test_qwen2_vl[qwen2_vl] SKIP
-triton_server/test_triton.py::test_gpt_ib_speculative_decoding_bls[gpt-ib-speculative-decoding-bls] SKIP
-triton_server/test_triton_llm.py::test_mistral_v1_multi_models[False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False-ensemble] SKIP
-perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:512,200-quant:fp8-tp:4] SKIP (https://nvbugspro.nvidia.com/bug/5304388)
-perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-cppmanager-exe-plugin_ifb-float16-input_output_len:200,2000-reqs:64-gpus:8] SKIP (https://nvbugspro.nvidia.com/bug/5304388)
-perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-con:1] SKIP (https://nvbugspro.nvidia.com/bug/5304388)
-perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:20000,2000-quant:fp8-con:1] SKIP (https://nvbugspro.nvidia.com/bug/5304388)
-perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-con:1] SKIP (https://nvbugspro.nvidia.com/bug/5304388)
-perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:20000,2000-con:1] SKIP (https://nvbugspro.nvidia.com/bug/5304388)
-perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-con:1] SKIP (https://nvbugspro.nvidia.com/bug/5304388)
-perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:20000,2000-reqs:500-con:250] SKIP (https://nvbugspro.nvidia.com/bug/5304388)
-perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:5000,500-reqs:500-con:250] SKIP (https://nvbugspro.nvidia.com/bug/5304388)
-perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:200,2000-reqs:64-con:1-gpus:8] SKIP (https://nvbugspro.nvidia.com/bug/5304388)
-perf/test_perf.py::test_perf[llama_v3.1_8b-cppmanager-exe-plugin_ifb-bfloat16-mp-maxbs:256-input_output_len:128,128-pp:2] SKIP (https://nvbugspro.nvidia.com/bug/5295390)
-perf/test_perf.py::test_perf[llama_70b_sq_per_tensor-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-gpus:2] SKIP (https://nvbugspro.nvidia.com/bug/5295390)
-perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-ootb_except_mha-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4] SKIP (https://nvbugspro.nvidia.com/bug/5295390)
-perf/test_perf.py::test_perf[mixtral_8x7b-cppmanager-exe-plugin_ifb-float16-mp-input_output_len:128,128+512,32-gpus:2] SKIP (https://nvbugspro.nvidia.com/bug/5295390)
-perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-ootb_except_mha-float16-input_output_len:128,128+512,32-gpus:4] SKIP (https://nvbugspro.nvidia.com/bug/5295390)
-perf/test_perf.py::test_perf[llama_v3_8b_instruct-cppmanager-exe-plugin_ifb-bfloat16-gwp:0.5-input_output_len:128,128+512,32] SKIP (https://nvbugspro.nvidia.com/bug/5295390)
-perf/test_perf.py::test_perf[gpt_350m_moe-cpp-plugin-float16-bs:64-input_output_len:128,8+512,32] SKIP (https://nvbugspro.nvidia.com/bug/5295390)
-perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-exe-plugin_ifb-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4] SKIP (https://nvbugspro.nvidia.com/bug/5295390)
-perf/test_perf.py::test_perf[flan_t5_xxl-cppmanager-exe-plugin_ifb-float16-input_output_len:512,32-gpus:4] SKIP (https://nvbugspro.nvidia.com/bug/5295390)
-perf/test_perf.py::test_perf[flan_t5_xxl-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128-gpus:4] SKIP (https://nvbugspro.nvidia.com/bug/5295390)
-perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128-gpus:4] SKIP (https://nvbugspro.nvidia.com/bug/5295390)
-perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:1024,1024-tp:2-gpus:2] SKIP (https://nvbugspro.nvidia.com/bug/5295411)
-perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8-gpus:2] SKIP (https://nvbugspro.nvidia.com/bug/5295411)
-perf/test_perf.py::test_perf[bart_large_cnn-bench-float16-input_output_len:128,20] SKIP (https://nvbugspro.nvidia.com/bug/5295411)
-perf/test_perf.py::test_perf[mamba_130m-bench-float16-input_output_len:128,128] SKIP (https://nvbugspro.nvidia.com/bug/5295411)
-perf/test_perf.py::test_perf[bert_large-bench-float16-maxbs:32-input_len:128+512] SKIP (https://nvbugspro.nvidia.com/bug/5295411)
-perf/test_perf.py::test_perf[roberta_base-bench-float16-maxbs:32-input_len:128+512] SKIP (https://nvbugspro.nvidia.com/bug/5295411)
-examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5333849)
-examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
-examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
-triton_server/test_triton.py::test_mllama[mllama] SKIP (https://nvbugs/5333818)
-examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
-unittest/llmapi/test_llm_multi_gpu.py -m "gpu4 and part0" SKIP (https://nvbugs/5348958)
-accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] SKIP (https://nvbugs/5346443)
-examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5401233)
-examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b] SKIP (https://nvbugs/5401233)
-examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-fp8-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5419070)
-examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] SKIP (https://nvbugs/5431132)
-examples/test_nemotron_nas.py::test_nemotron_nas_summary_1gpu[DeciLM-7B] SKIP (https://nvbugs/5444636)
-examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (https://nvbugs/5447530)
-examples/test_nemotron_nas.py::test_nemotron_nas_summary_2gpu[DeciLM-7B] SKIP (https://nvbugs/5444636)
-examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-cnn_dailymail-Qwen2-VL-7B-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5453709)
-examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5453709)
-triton_server/test_triton.py::test_gpt_ib[gpt-ib] SKIP (https://nvbugs/5431116)
-accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8 SKIP (https://nvbugs/5413197)
-triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (https://nvbugs/5371349)
-triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624)
-triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343)
-triton_server/test_triton.py::test_t5_ib[t5-ib] SKIP (https://nvbugs/5456482)
-triton_server/test_triton.py::test_python_bls_unit_tests[python-bls-unit-tests] SKIP (https://nvbugs/5477392)
-triton_server/test_triton.py::test_mistral_ib[mistral-ib] SKIP (https://nvbugs/5477399)
-triton_server/test_triton.py::test_eagle[eagle] SKIP (https://nvbugs/5477378)
+perf/test_perf.py::test_perf[t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449)
+perf/test_perf.py::test_perf[whisper_large_v3-bench-float16-input_output_len:128,20] SKIP
+perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6215810)
+perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] SKIP (https://nvbugs/6167060)
+perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_dep8_mtp1_8k1k] SKIP (https://nvbugs/6190071)
+perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_tep8_mtp3_8k1k] SKIP (https://nvbugs/6189928)
+perf/test_perf_sanity.py::test_e2e[aggr_upload-llama3_1_8b_fp8_ad_hopper-llama3_1_8b_ad_ws1_1k1k] SKIP (https://nvbugs/6192201)
+perf/test_perf_sanity.py::test_e2e[aggr_upload-super_ad_blackwell-super_ad_ws1_1k1k] SKIP (https://nvbugs/6153575)
+perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6215844)
+perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] SKIP (https://nvbugs/6085022)
+perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6179661)
+perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6215844)
+perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] SKIP (https://nvbugs/6085022)
+perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6016528)
+perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] SKIP (https://nvbugs/6085022)
+perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6085022)
+perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL] SKIP (https://nvbugs/6200257)
+perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6215844)
+perf/test_visual_gen_perf_sanity.py::test_visual_gen_e2e[vg_upload-flux2_blackwell-flux2_fp8_cfg1_ulysses4_teacache_on] SKIP (https://nvbugs/6162857)
+perf/test_visual_gen_perf_sanity.py::test_visual_gen_e2e[vg_upload-ltx2_blackwell-ltx2_2stage_bf16_i2v_cfg2_ulysses4_compile_on] SKIP (https://nvbugs/6162857)
+perf/test_visual_gen_perf_sanity.py::test_visual_gen_e2e[vg_upload-ltx2_blackwell-ltx2_2stage_bf16_t2v_cfg2_ulysses4_compile_on] SKIP (https://nvbugs/6162857)
+perf/test_visual_gen_perf_sanity.py::test_visual_gen_e2e[vg_upload-ltx2_blackwell-ltx2_nvfp4_i2v_cfg2_ulysses4_compile_on] SKIP (https://nvbugs/6162857)
+perf/test_visual_gen_perf_sanity.py::test_visual_gen_e2e[vg_upload-wan21_t2v_14b_blackwell-wan21_14b_nvfp4_trtllm_cfg2_ulysses4_teacache_on] SKIP (https://nvbugs/6162857)
+perf/test_visual_gen_perf_sanity.py::test_visual_gen_e2e[vg_upload-wan22_i2v_a14b_blackwell-wan22_i2v_a14b_nvfp4_trtllm_cfg2_ulysses4] SKIP (https://nvbugs/6162857)
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1-0528-FP4_tp4-stress_time_3600s_timeout_10800s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy] SKIP (https://nvbugs/6207678)
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1-0528-FP4_tp4-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy] SKIP (https://nvbugs/6207678)
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp8-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy] SKIP (https://nvbugs/6143599)
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_10800s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy] SKIP (https://nvbugs/6143599)
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy] SKIP (https://nvbugs/6143599)
+stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test] SKIP (https://nvbugs/6215678)
+stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test] SKIP (https://nvbugs/6215678)
+test_doc.py::test_url_validity SKIP (https://nvbugs/6215684)
+test_e2e.py::test_draft_token_tree_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B] SKIP (https://nvbugs/5989907)
+test_e2e.py::test_draft_token_tree_quickstart_advanced_eagle3_depth_1_tree[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B] SKIP (https://nvbugs/5989907)
+test_e2e.py::test_multi_nodes_eval[DeepSeek-R1/DeepSeek-R1-0528-FP4-tp16-mmlu] SKIP (https://nvbugs/6114608)
+test_e2e.py::test_multi_nodes_eval[Kimi-K2-Thinking-NVFP4-tp16-mmlu] SKIP (https://nvbugs/6114608)
+test_e2e.py::test_multi_nodes_eval[Qwen3/Qwen3-235B-A22B-tp16-mmlu] SKIP (https://nvbugs/6115560)
+test_e2e.py::test_multi_nodes_eval[Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf-tp16-mmlu] SKIP (https://nvbugs/6114608)
+test_e2e.py::test_multi_nodes_eval[nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-tp16-mmlu] SKIP (https://nvbugs/6114608)
 test_e2e.py::test_openai_chat_example[trt] SKIP (https://nvbugs/5477444)
+test_e2e.py::test_openai_completions_example[trt] SKIP (https://nvbugs/5701450)
+test_e2e.py::test_openai_disagg_multi_nodes_completion[ctx_tp1pp2-gen_tp1pp2] SKIP (https://nvbugs/6190759)
+test_e2e.py::test_openai_disagg_multi_nodes_completion_service_discovery[http] SKIP (https://nvbugs/6115562)
+test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_w4afp8_8gpus[DeepSeek-R1-W4AFP8-DeepSeek-R1/DeepSeek-R1-W4AFP8] SKIP (https://nvbugs/5836830)
 test_e2e.py::test_trtllm_bench_iteration_log[TRT-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B] SKIP (https://nvbugs/5448523)
-accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype SKIP (https://nvbugs/5520319)
-examples/test_llama.py::test_llm_llama_1gpu_fp8_kv_cache[llama-v2-7b-hf-bfloat16] SKIP (https://nvbugs/5527940)
-examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8] SKIP (https://nvbugs/5546507)
-examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1] SKIP (https://nvbugs/5546507)
-examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] SKIP (https://nvbugs/5546507)
-cpp/test_e2e.py::test_benchmarks[gpt-80] SKIP (https://nvbugs/5550689)
-cpp/test_e2e.py::test_benchmarks[bart-90] SKIP (https://nvbugs/5550689)
-full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553)
-full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5574553)
-full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True] SKIP (https://nvbugs/5574553)
-full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5574553)
-full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553)
-full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553)
-full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553)
-full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5574553)
-full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True] SKIP (https://nvbugs/5574553)
-full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5574553)
-full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553)
-full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553)
+test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/5864769)
+triton_server/test_triton.py::test_cpp_unit_tests[cpp-unit-tests] SKIP (https://nvbugs/5619359)
+triton_server/test_triton.py::test_eagle[eagle] SKIP (https://nvbugs/5477378)
 triton_server/test_triton.py::test_gpt_2b_ib_lora[gpt-2b-ib-lora] SKIP (https://nvbugs/5470830)
-unittest/llmapi/test_memory_profiling.py::test_profile_kvcache SKIP (https://nvbugs/5580781)
+triton_server/test_triton.py::test_gpt_disaggregated_serving_bls[gpt-disaggregated-serving-bls] SKIP (https://nvbugs/5582118)
+triton_server/test_triton.py::test_gpt_gather_logits[gpt-gather-logits] SKIP (https://nvbugs/5766960)
+triton_server/test_triton.py::test_gpt_ib[gpt-ib] SKIP (https://nvbugs/5431116)
+triton_server/test_triton.py::test_gpt_ib_lad[gpt-ib-lad] SKIP (https://nvbugs/5775223)
+triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624)
+triton_server/test_triton.py::test_gpt_ib_speculative_decoding_bls[gpt-ib-speculative-decoding-bls] SKIP
+triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (https://nvbugs/5371349)
+triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-decoding] SKIP (https://nvbugs/5762854)
 triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414)
-full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5569696)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-cutlass-auto] SKIP (https://nvbugs/5596343)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-auto] SKIP (https://nvbugs/5596343)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-cutlass-auto] SKIP (https://nvbugs/5596343)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-tp4-cutlass-auto] SKIP (https://nvbugs/5596343)
-examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16] SKIP (https://nvbugs/5612313)
-triton_server/test_triton.py::test_cpp_unit_tests[cpp-unit-tests] SKIP (https://nvbugs/5619359)
-triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] SKIP (https://nvbugs/5619369)
-examples/test_gpt.py::test_llm_minitron_fp8_with_pseudo_loras[4b] SKIP (https://nvbugs/5606233)
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5616182)
-full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache SKIP (https://nvbugs/5682551)
-test_e2e.py::test_openai_completions_example[trt] SKIP (https://nvbugs/5701450)
+triton_server/test_triton.py::test_llava_onevision[llava_onevision] SKIP (https://nvbugs/5775205)
+triton_server/test_triton.py::test_mistral_ib[mistral-ib] SKIP (https://nvbugs/5477399)
+triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343)
+triton_server/test_triton.py::test_mllama[mllama] SKIP (https://nvbugs/5333818)
+triton_server/test_triton.py::test_python_bls_unit_tests[python-bls-unit-tests] SKIP (https://nvbugs/5477392)
+triton_server/test_triton.py::test_qwen2_vl[qwen2_vl] SKIP
+triton_server/test_triton.py::test_t5_ib[t5-ib] SKIP (https://nvbugs/5456482)
+triton_server/test_triton.py::test_triton_extensive[triton-extensive] SKIP
 triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm] SKIP (https://nvbugs/5701480)
+triton_server/test_triton_llm.py::test_mistral_v1_multi_models[False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False-ensemble] SKIP
+triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble] SKIP (https://nvbugs/5240060)
+triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] SKIP (https://nvbugs/5619369)
+unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend[act=Relu2-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=NVFP4-routing=Renormalize] SKIP (https://nvbugs/5989912)
+unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=IGNORE-e8_k1_h512_i512-seq=8-dtype=torch.bfloat16-backend=MEGAMOE_DEEPGEMM-quant=W4A8_MXFP4_MXFP8-routing=DeepSeekV3] SKIP (https://nvbugs/6175060)
+unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu_eplb -k "MEGAMOE_DEEPGEMM" SKIP (https://nvbugs/6175060)
+unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e256_k8_h7168_i2048-seq=1-dtype=torch.bfloat16-backend=MEGAMOE_DEEPGEMM-quant=W4A8_MXFP4_MXFP8-routing=DeepSeekV3] SKIP (https://nvbugs/6175060)
 unittest/_torch/modules/tests_lora_modules/test_lora_attention_pytorch_flow_vs_trt.py::TestLoraAttentionPytorchFlowVsTRT::test_lora_attention SKIP (https://nvbugs/5701421)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto] SKIP (https://nvbugs/5596343)
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-bf16-_tokens16-_hidden32] SKIP (https://nvbugs/5940460)
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-bf16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460)
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-bf16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460)
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-bf16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460)
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-fp16-_tokens16-_hidden32] SKIP (https://nvbugs/5940460)
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-fp16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460)
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-fp16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460)
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-fp16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460)
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens16-_hidden32] SKIP (https://nvbugs/5940460)
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460)
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460)
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460)
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens16-_hidden32] SKIP (https://nvbugs/5940460)
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460)
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460)
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460)
+unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py::test_cp_tp_broadcast_object[tp_cp_broadcast-list] SKIP (https://nvbugs/6132301)
+unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingDSv3-swiglu-1024-1024-1] SKIP (https://nvbugs/5908070)
+unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_qwen_next-swiglu-1024-1024-150] SKIP (https://nvbugs/5908070)
+unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_topk_4-swiglu-1024-1024-150] SKIP (https://nvbugs/5908070)
+unittest/_torch/visual_gen/test_flux_pipeline.py::TestFluxCombinedOptimizations::test_all_optimizations_combined SKIP (https://nvbugs/6199854)
+unittest/auto_deploy/singlegpu/models/test_qwen3_5_moe.py::test_vision_attention_matches_reference SKIP (https://nvbugs/6189450)
+unittest/auto_deploy/singlegpu/models/test_qwen3_5_moe.py::test_vision_block_matches_reference SKIP (https://nvbugs/6189450)
+unittest/auto_deploy/singlegpu/models/test_qwen3_5_moe.py::test_vlm_wrapper_delta_is_request_scoped_no_cross_call_leakage SKIP (https://nvbugs/6189450)
+unittest/auto_deploy/singlegpu/smoke/test_ad_build_small_single.py::test_build_ad[deepseek-ai/DeepSeek-V3-llm_extra_args10] SKIP (https://nvbugs/5888827)
+unittest/auto_deploy/standalone SKIP (https://nvbugs/6160629)
+unittest/auto_deploy/standalone/test_standalone_package.py::TestStandalonePackage::test_run_unit_tests SKIP (https://nvbugs/6160629)
+unittest/disaggregated/test_agent_multi_backends.py::test_run_with_different_env[1] SKIP (https://nvbugs/5979673)
 unittest/executor/test_rpc.py::TestRpcCorrectness::test_incremental_task_async SKIP (https://nvbugs/5741476)
-cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941)
-examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16] SKIP (https://nvbugs/5608979)
-examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] SKIP (https://nvbugs/5608979)
-unittest/_torch/speculative/test_dynamic_spec_decode.py::test_dynamic_spec_decode SKIP (https://nvbugs/5758449)
-triton_server/test_triton.py::test_gpt_disaggregated_serving_bls[gpt-disaggregated-serving-bls] SKIP (https://nvbugs/5582118)
-triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-decoding] SKIP (https://nvbugs/5762854)
-examples/test_ray.py::test_ray_disaggregated_serving[tp2] SKIP (https://nvbugs/5612502)
 unittest/executor/test_rpc_proxy.py SKIP (https://nvbugs/5605741)
 unittest/executor/test_rpc_worker.py SKIP (https://nvbugs/5605741)
-cpp/test_e2e.py::test_model[-redrafter-86] SKIP (https://nvbugs/5761642)
-triton_server/test_triton.py::test_gpt_gather_logits[gpt-gather-logits] SKIP (https://nvbugs/5766960)
-accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm] SKIP (https://nvbugs/5772360)
-accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model] SKIP (https://nvbugs/5772993)
-accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph SKIP (https://nvbugs/5772995)
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True] SKIP (https://nvbugs/5775326)
-triton_server/test_triton.py::test_llava_onevision[llava_onevision] SKIP (https://nvbugs/5775205)
-triton_server/test_triton.py::test_gpt_ib_lad[gpt-ib-lad] SKIP (https://nvbugs/5775223)
-cpp/test_e2e.py::test_model[-mamba-86] SKIP (https://nvbugs/5781665)
-unittest/_torch/ray_orchestrator/multi_gpu/test_multi_instance.py::test_multi_instance[tp2_2instances] SKIP (https://nvbugs/5784566)
-accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False] SKIP (https://nvbugs/5795918)
-examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5802248)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model] SKIP (https://nvbugs/5596343)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model] SKIP (https://nvbugs/5596343)
-examples/test_ray.py::test_llm_inference_distributed_ray[tp2pp2] SKIP (https://nvbugs/5781731)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/5819005)
-unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks SKIP (https://nvbugs/5819014)
-unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_gptoss_style_nvfp4[limit1-beta0-alpha1-RoutingGPTOSS-512-512-1] SKIP (https://nvbugs/5819042)
-llmapi/test_llm_examples.py::test_llmapi_tensorrt_engine SKIP (https://nvbugs/5820553)
-accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_vswa_reuse SKIP (https://nvbugs/5820497)
-accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_guided_decoding_vswa_reuse[xgrammar] SKIP (https://nvbugs/5820497)
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=True] SKIP (https://nvbugs/5821415)
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=True] SKIP (https://nvbugs/5821415)
-test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_w4afp8_8gpus[DeepSeek-R1-W4AFP8-DeepSeek-R1/DeepSeek-R1-W4AFP8] SKIP (https://nvbugs/5836830)
-accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664)
-cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5838199)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/5838211)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/5838211)
-full:A10/unittest/kv_cache_manager_v2_tests/ SKIP (https://nvbugs/5841954)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[disable_skip_indexer] SKIP (https://nvbugs/5859886)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-fp8] SKIP (https://nvbugs/5651865)
-test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/5864769)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5879577)
-unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_gptoss_style_nvfp4[limitinf-beta0-alpha0.1-RoutingGPTOSS-512-512-1] SKIP (https://nvbugs/5819042)
-unittest/_torch/flashinfer/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5920779)
-test_e2e.py::test_openai_chat_guided_decoding[openai/gpt-oss-120b] SKIP (https://nvbugs/5884677)
-unittest/auto_deploy/singlegpu/smoke/test_ad_build_small_single.py::test_build_ad[deepseek-ai/DeepSeek-V3-llm_extra_args10] SKIP (https://nvbugs/5888827)
-unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingDSv3-swiglu-1024-1024-1] SKIP (https://nvbugspro.nvidia.com/bug/5908070)
-unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_qwen_next-swiglu-1024-1024-150] SKIP (https://nvbugspro.nvidia.com/bug/5908070)
-unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_topk_4-swiglu-1024-1024-150] SKIP (https://nvbugspro.nvidia.com/bug/5908070)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugspro.nvidia.com/bug/5916092)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugspro.nvidia.com/bug/5916155)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugspro.nvidia.com/bug/5916155)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugspro.nvidia.com/bug/5916155)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugspro.nvidia.com/bug/5916155)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugspro.nvidia.com/bug/5916155)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugspro.nvidia.com/bug/5916155)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugspro.nvidia.com/bug/5916155)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugspro.nvidia.com/bug/5916155)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugspro.nvidia.com/bug/5916155)
-unittest/_torch/visual_gen/test_wan.py::TestWanTwoStageTransformer::test_two_stage_with_trtllm_attention SKIP (https://nvbugspro.nvidia.com/bug/5916830)
-full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] SKIP (https://nvbugs/5929339)
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460)
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460)
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460)
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens16-_hidden32] SKIP (https://nvbugs/5940460)
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460)
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460)
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460)
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens16-_hidden32] SKIP (https://nvbugs/5940460)
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-fp16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460)
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-fp16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460)
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-fp16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460)
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-fp16-_tokens16-_hidden32] SKIP (https://nvbugs/5940460)
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-bf16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460)
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-bf16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460)
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-bf16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460)
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-bf16-_tokens16-_hidden32] SKIP (https://nvbugs/5940460)
-cpp/test_e2e.py::test_model[-gpt-80] SKIP (https://nvbugs/5983283)
-full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v2_kv_cache-cutlass-one_model-overlap_scheduler] SKIP (https://nvbugs/5945047)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5945081)
-full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5948435)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] SKIP (https://nvbugs/5955773)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] SKIP (https://nvbugs/5955792)
-full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/5961814)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_2_model_mtp SKIP (https://nvbugs/5966585)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model] SKIP (https://nvbugs/5927636)
-full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5961814)
-test_e2e.py::test_ptp_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] SKIP (https://nvbugs/5969725)
-accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_accuracy[nvfp4-4-attn_dp_on-trtllm] SKIP (https://nvbugs/5973199)
-unittest/disaggregated/test_agent_multi_backends.py::test_run_with_different_env[1] SKIP (https://nvbugs/5979673)
-verl/test_verl_cases.py::test_adapter SKIP (https://nvbugs/5981833)
-verl/test_verl_cases.py::test_async_server SKIP (https://nvbugs/5981833)
-verl/test_verl_cases.py::test_rollout_utils SKIP (https://nvbugs/5981833)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/5839028)
-accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_multi_gpus[throughput_trtllm] SKIP (https://nvbugs/5981293)
-accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_2_model_mtp[2model] SKIP (https://nvbugs/5981293)
-test_e2e.py::test_draft_token_tree_quickstart_advanced_eagle3_depth_1_tree[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B] SKIP (https://nvbugs/5989907)
-test_e2e.py::test_draft_token_tree_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B] SKIP (https://nvbugs/5989907)
-unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend[act=Relu2-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=NVFP4-routing=Renormalize] SKIP (https://nvbugs/5989912)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_piecewise_cuda_graph[mtp3_fp8kv_chunked] SKIP (https://nvbugs/5989920)
-accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5992113)
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[use_temperature=False-attn_backend=TRTLLM] SKIP (https://nvbugs/5997547)
-accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8 SKIP (https://nvbugs/6004530)
-unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=DEEPEP-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=NVFP4-routing=Renormalize] SKIP (https://nvbugs/6007285)
-accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[latency] SKIP (https://nvbugs/6012526)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/6018046)
-test_fmha.py::test_fmha SKIP (https://nvbugs/6018058)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/6029882)
-accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_accuracy[bf16-4-attn_dp_off-trtllm] SKIP (https://nvbugs/5919796)
-accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_accuracy[fp8-4-attn_dp_off-trtllm] SKIP (https://nvbugs/6058066)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False-v2_kv_cache=False] SKIP (https://nvbugs/6027594)
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_pard[overlap_scheduler=True] SKIP (https://nvbugs/6037653)
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_pard[overlap_scheduler=False] SKIP (https://nvbugs/6037653)
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] SKIP (https://nvbugs/6037654)
-examples/test_visual_gen.py::test_vbench_dimension_score_wan22_a14b_nvfp4 SKIP (https://nvbugs/6050483)
-visual_gen/test_visual_gen_benchmark.py::test_online_benchmark[openai-videos] SKIP (https://nvbugs/6050483)
-examples/test_visual_gen.py::test_vbench_dimension_score_wan SKIP (https://nvbugs/6050483)
-examples/test_visual_gen.py::test_vbench_dimension_score_wan22_a14b_fp8 SKIP (https://nvbugs/6050483)
+unittest/llmapi/test_llm_multi_gpu.py -m "gpu4 and part0" SKIP (https://nvbugs/5348958)
+unittest/llmapi/test_llm_multi_gpu_pytorch.py::test_phi3_lora_fused_modules_output_on_tp2_identical_to_tp1 SKIP (https://nvbugs/6109745)
+unittest/llmapi/test_memory_profiling.py::test_profile_kvcache SKIP (https://nvbugs/5580781)
+unittest/tools/test_layer_wise_benchmarks.py::test_performance_alignment[1] SKIP (https://nvbugs/6127669)
+unittest/tools/test_layer_wise_benchmarks.py::test_qwen3_next_gen_tep[1] SKIP (https://nvbugs/6153575)
 visual_gen/test_visual_gen_benchmark.py::test_offline_benchmark SKIP (https://nvbugs/6050483)
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[pp4-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/6050487)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6050489)
-disaggregated/test_disaggregated.py::test_disaggregated_gpt_oss_120b_harmony[gpt_oss/gpt-oss-120b] SKIP (https://nvbugs/6011317)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6050489)
-disaggregated/test_disaggregated.py::test_disaggregated_overlap_gen_first[ctx_pp1-TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6057459)
-disaggregated/test_disaggregated.py::test_disaggregated_overlap_gen_first[ctx_pp4-TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6057460)
-perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX] SKIP (https://nvbugs/6085022)
-full:sm89/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized[torch_compile=False] SKIP (https://nvbugs/6070878)
-full:sm89/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized[torch_compile=True] SKIP (https://nvbugs/6070878)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-triton-auto] SKIP (https://nvbugs/6026676)
-accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus] SKIP (https://nvbugs/6069790)
-accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_2_model_mtp[2model_trtllm] SKIP (https://nvbugs/5981293)
-accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_multi_gpus[throughput] SKIP (https://nvbugs/5981293)
-disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6069686)
-disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6074784)
-disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx_tp1_single_gpu[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6074784)
-disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6074784)
-disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_two_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6074784)
-accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[google_gemma-3-1b-it-False] SKIP (https://nvbugs/6059036)
-accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[meta-llama_Llama-3.1-8B-Instruct-False] SKIP (https://nvbugs/6059036)
-accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[mistralai_Codestral-22B-v0.1-False] SKIP (https://nvbugs/6059036)
-accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[mistralai_Ministral-8B-Instruct-2410-False] SKIP (https://nvbugs/6059036)
-accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[Qwen_QwQ-32B-False] SKIP (https://nvbugs/6059036)
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/6050489)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6050489)
-accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[deepseek-ai_DeepSeek-R1-0528-True] SKIP (https://nvbugs/6070955)
-accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B_Instruct_Eagle3::test_eagle3_one_model SKIP (https://nvbugs/5997534)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6076560)
-accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[trtllm-flashinfer_ssm-False] SKIP (https://nvbugs/6076564)
-unittest/_torch/ray_orchestrator/multi_gpu/test_llm_update_weights_multi_gpu.py SKIP (https://nvbugs/6076624)
-unittest/llmapi/test_llm_pytorch.py::test_llm_disagg_streaming_gen_cancelled SKIP (https://nvbugs/6078431)
-unittest/auto_deploy/singlegpu/transformations/library/test_mrope_delta_cache.py::test_qwen_registry_configs_explicitly_enable_mrope_delta_cache SKIP (https://nvbugs/6078421)
-accuracy/test_llm_api_autodeploy.py::TestQwen3_5_397B_MoE::test_nvfp4[8] SKIP (https://nvbugs/6080024)
-accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[meta-llama_Llama-3.3-70B-Instruct-False] SKIP (https://nvbugs/6059036)
-full:DGX_H100/unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py::test_reducescatter_pg_op[var_len:True-seqlen:64-hidden:128] SKIP (https://nvbugs/6080037)
-llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_eagle3 SKIP (https://nvbugs/6075431)
-llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp SKIP (https://nvbugs/6079440)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6079919)
-disaggregated/test_disaggregated.py::test_disaggregated_benchmark_gen_only_insufficient_kv[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6087632)
-disaggregated/test_disaggregated.py::test_disaggregated_conditional[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6087632)
-disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6087632)
-disaggregated/test_disaggregated.py::test_disaggregated_kv_cache_time_output[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6087632)
-disaggregated/test_disaggregated.py::test_disaggregated_ngram[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6087632)
-disaggregated/test_disaggregated.py::test_disaggregated_overlap[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6087632)
-disaggregated/test_disaggregated.py::test_disaggregated_single_gpu[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6087632)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6084720)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] SKIP (https://nvbugs/6084764)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] SKIP (https://nvbugs/6084775)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6084824)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=fp8-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True] SKIP (https://nvbugs/6084445)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=0] SKIP (https://nvbugs/6084447)
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] SKIP (https://nvbugs/6084568)
-perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/6088149)
-accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/6070857)
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=False] SKIP (https://nvbugs/6094071)
-accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray SKIP (https://nvbugs/6094070)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_dummy_load_format SKIP (https://nvbugs/6094072)
-cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-nixl_kvcache-90] SKIP (https://nvbugs/6093820)
-cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-ucx_kvcache-90] SKIP (https://nvbugs/6093820)
-accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[fp8-1-trtllm] SKIP (https://nvbugs/6094208)
-accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-1-trtllm] SKIP (https://nvbugs/6094208)
-accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[torch-True-1] SKIP (https://nvbugs/6093714)
-accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[trtllm-True] SKIP (https://nvbugs/6093713)
-accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[trtllm-False] SKIP (https://nvbugs/6093713)
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_gen_first[ctx_tp1pp1-gen_tp1pp1] SKIP (https://nvbugs/6093712)
-accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap_adp_on] SKIP (https://nvbugs/6094068)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9-fp8kv=True] SKIP (https://nvbugs/6094066)
-accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[nvidia_Llama-3.1-8B-Instruct-NVFP4-True] SKIP (https://nvbugs/6093715)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] SKIP (https://nvbugs/6084447)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6084447)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=2] SKIP (https://nvbugs/6084447)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=True] SKIP (https://nvbugs/6084447)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True-sampler_async_worker=False] SKIP (https://nvbugs/6084447)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=True] SKIP (https://nvbugs/6084447)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6084447)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6084447)
-disaggregated/test_auto_scaling.py::test_service_discovery[etcd-kv_cache_aware] SKIP (https://nvbugs/6094100)
-disaggregated/test_disaggregated.py::test_disaggregated_chat_completion_tool_calls[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
-disaggregated/test_auto_scaling.py::test_service_discovery[http-load_balancing] SKIP (https://nvbugs/6094100)
-disaggregated/test_auto_scaling.py::test_minimal_instances[http-round_robin] SKIP (https://nvbugs/6094100)
-disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
-disaggregated/test_auto_scaling.py::test_worker_restart[http-round_robin] SKIP (https://nvbugs/6094100)
-disaggregated/test_auto_scaling.py::test_service_discovery[etcd-load_balancing] SKIP (https://nvbugs/6094100)
-disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin] SKIP (https://nvbugs/6094100)
-disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_gentp2[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
-disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
-disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/6094100)
-disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_trt_backend[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
-disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
-disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_genpp2[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
-disaggregated/test_auto_scaling.py::test_worker_restart[etcd-load_balancing] SKIP (https://nvbugs/6094100)
-disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_trt_backend[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
-disaggregated/test_workers.py::test_workers_conditional_disaggregation_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6094100)
-disaggregated/test_auto_scaling.py::test_service_discovery[http-kv_cache_aware] SKIP (https://nvbugs/6094100)
-disaggregated/test_auto_scaling.py::test_minimal_instances[etcd-round_robin] SKIP (https://nvbugs/6094100)
-disaggregated/test_disaggregated.py::test_disaggregated_diff_max_tokens[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
-disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
-disaggregated/test_auto_scaling.py::test_worker_restart[etcd-kv_cache_aware] SKIP (https://nvbugs/6094100)
-disaggregated/test_auto_scaling.py::test_service_discovery[etcd-round_robin] SKIP (https://nvbugs/6094100)
-disaggregated/test_auto_scaling.py::test_disagg_server_restart[http-round_robin] SKIP (https://nvbugs/6094100)
-disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
-disaggregated/test_disaggregated.py::test_disaggregated_perf_metrics[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100)
-disaggregated/test_auto_scaling.py::test_worker_restart[http-load_balancing] SKIP (https://nvbugs/6094100)
-disaggregated/test_auto_scaling.py::test_worker_restart[http-kv_cache_aware] SKIP (https://nvbugs/6094100)
-disaggregated/test_auto_scaling.py::test_service_discovery[http-round_robin] SKIP (https://nvbugs/6094100)
-disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_llama_context_capacity[False-False-DeepSeek-V3-Lite-fp8/fp8] SKIP (https://nvbugs/6095421)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6095421)
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/6095421)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/6095421)
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[pp4-attn_backend=FLASHINFER-torch_compile=False] SKIP (https://nvbugs/6095421)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6095421)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6095421)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM] SKIP (https://nvbugs/6095421)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6095421)
-perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_tep8_mtp3] SKIP (https://nvbugs/6095700)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6095851)
-full:DGX_B200/examples/test_visual_gen.py::test_wan_t2v_example SKIP (https://nvbugs/6097980)
-accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/6098442)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6098790)
+visual_gen/test_visual_gen_benchmark.py::test_online_benchmark[openai-videos] SKIP (https://nvbugs/6050483)

From 209630d412d916156e8bd2d46ef3728b8020917a Mon Sep 17 00:00:00 2001
From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
Date: Sun, 7 Jun 2026 21:07:12 -0700
Subject: [PATCH 2/2] [TensorRT-LLM][SpecDec] Capture count penalty in CUDA
 graphs

CUDA graph warmup can capture speculative sampling without the generated-token count frequency-penalty path when warmup requests have no frequency penalty. Later RWLT GPT-OSS disagg requests replay that graph with frequency_penalty and prompt_ignore_length, so repeated generated tokens are not penalized.

Add speculative logits penalty CUDA ops, preserve sequence-slot count state across CUDA graph metadata/replay, append accepted tokens back into count state, and gate forced graph count capture to the disaggregated generation role by default.

Validation: python3 -m py_compile on modified Python modules; git diff --cached --check; GPT-OSS disagg original NVBug config ran 8 total auto-gating runs with >10k=0 and 16K/length=0.
---
 .../logitsPenaltyKernels.cu                   |  498 +++++++
 .../logitsPenaltyKernels.h                    |   56 +
 cpp/tensorrt_llm/thop/CMakeLists.txt          |    1 +
 .../thop/speculativeLogitsPenaltyOp.cpp       |  473 ++++++
 .../_torch/pyexecutor/model_engine.py         |  114 +-
 tensorrt_llm/_torch/speculative/interface.py  | 1297 ++++++++++++++++-
 .../_torch/speculative/one_model_sampler.py   |  101 ++
 .../_torch/speculative/spec_sampler_base.py   |  432 +++++-
 tensorrt_llm/_torch/speculative/utils.py      |   12 +-
 9 files changed, 2966 insertions(+), 18 deletions(-)
 create mode 100644 cpp/tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.cu
 create mode 100644 cpp/tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.h
 create mode 100644 cpp/tensorrt_llm/thop/speculativeLogitsPenaltyOp.cpp

diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.cu
new file mode 100644
index 00000000000..695748b1bdc
--- /dev/null
+++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.cu
@@ -0,0 +1,498 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.h"
+
+#include <algorithm>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+TRTLLM_NAMESPACE_BEGIN
+
+namespace kernels
+{
+
+template <typename T, typename TokenT> // Per row (one block each): logits[row][tokenIds[row][i]] -= penaltyValues[row][i]
+__global__ void applySpeculativeTokenPenaltiesKernel(
+    T* logits, TokenT const* tokenIds, float const* penaltyValues, int32_t width, int32_t vocabSize)
+{
+    auto const row = static_cast<int64_t>(blockIdx.x); // 64-bit so the row offsets below cannot overflow int32
+    auto const rowTokenIds = tokenIds + row * width;
+    auto const rowPenaltyValues = penaltyValues + row * width;
+    auto rowLogits = logits + row * vocabSize;
+    // Narrow rows: one thread walks the row serially, so repeated token ids accumulate in order.
+    if (width <= 32)
+    {
+        if (threadIdx.x != 0)
+        {
+            return;
+        }
+        for (auto idx = 0; idx < width; ++idx)
+        {
+            auto const tokenId = static_cast<int64_t>(rowTokenIds[idx]);
+            auto const penalty = rowPenaltyValues[idx];
+            if (penalty != 0.0f && tokenId >= 0 && tokenId < vocabSize) // skip no-op penalties and out-of-vocabulary ids
+            {
+                auto const offset = static_cast<int32_t>(tokenId);
+                auto logit = static_cast<float>(rowLogits[offset]);
+                logit -= penalty; // arithmetic in float, then cast back to T
+                rowLogits[offset] = static_cast<T>(logit);
+            }
+        }
+        return;
+    }
+    // Wide rows: block-strided. NOTE(review): non-atomic update; duplicate token ids in a row would race -- confirm ids are unique.
+    for (auto idx = static_cast<int32_t>(threadIdx.x); idx < width; idx += static_cast<int32_t>(blockDim.x))
+    {
+        auto const tokenId = static_cast<int64_t>(rowTokenIds[idx]);
+        auto const penalty = rowPenaltyValues[idx];
+        if (penalty != 0.0f && tokenId >= 0 && tokenId < vocabSize) // skip no-op penalties and out-of-vocabulary ids
+        {
+            auto const offset = static_cast<int32_t>(tokenId);
+            auto logit = static_cast<float>(rowLogits[offset]);
+            logit -= penalty; // arithmetic in float, then cast back to T
+            rowLogits[offset] = static_cast<T>(logit);
+        }
+    }
+}
+
+template <typename T, typename TokenT> // Host launcher for applySpeculativeTokenPenaltiesKernel; enqueues on `stream`.
+void invokeApplySpeculativeTokenPenalties(T* logits, TokenT const* tokenIds, float const* penaltyValues, int32_t numRows,
+    int32_t width, int32_t vocabSize, cudaStream_t stream)
+{
+    if (numRows == 0 || width == 0) // nothing to do; also avoids a zero-sized launch
+    {
+        return;
+    }
+
+    dim3 const grid(numRows); // one block per row
+    dim3 const block(std::min(width, 256)); // at most 256 threads per block
+    applySpeculativeTokenPenaltiesKernel<T, TokenT><<<grid, block, 0, stream>>>(
+        logits, tokenIds, penaltyValues, width, vocabSize); // NOTE(review): launch status is not checked here
+}
+
+template void invokeApplySpeculativeTokenPenalties<float, int32_t>( // explicit instantiations: {float, half, bf16} x {int32, int64} ids
+    float*, int32_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t);
+template void invokeApplySpeculativeTokenPenalties<float, int64_t>(
+    float*, int64_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t);
+template void invokeApplySpeculativeTokenPenalties<half, int32_t>(
+    half*, int32_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t);
+template void invokeApplySpeculativeTokenPenalties<half, int64_t>(
+    half*, int64_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t);
+template void invokeApplySpeculativeTokenPenalties<__nv_bfloat16, int32_t>(
+    __nv_bfloat16*, int32_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t);
+template void invokeApplySpeculativeTokenPenalties<__nv_bfloat16, int64_t>(
+    __nv_bfloat16*, int64_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t);
+
+__global__ void applySpeculativeHistoryFrequencyPenaltyKernel(float* logits, int32_t const* historyTokens,
+    int32_t const* historyLens, int32_t const* rowSlots, float const* frequencyPenalties, int32_t historyCapacity,
+    int32_t vocabSize) // One block per logits row: subtracts the row's penalty once per in-vocabulary token in its slot's history.
+{
+    auto const row = static_cast<int32_t>(blockIdx.x);
+    auto const frequencyPenalty = frequencyPenalties[row];
+    if (frequencyPenalty == 0.0f) // zero penalty: the row is left untouched
+    {
+        return;
+    }
+
+    auto const slot = rowSlots[row]; // index into historyLens / historyTokens
+    if (slot < 0) // negative slot: the row is skipped
+    {
+        return;
+    }
+
+    auto const historyLen = min(max(historyLens[slot], 0), historyCapacity); // clamp to [0, historyCapacity]
+    auto const rowHistory = historyTokens + static_cast<int64_t>(slot) * historyCapacity;
+    auto rowLogits = logits + static_cast<int64_t>(row) * vocabSize;
+
+    for (auto idx = static_cast<int32_t>(threadIdx.x); idx < historyLen; idx += static_cast<int32_t>(blockDim.x))
+    {
+        auto const tokenId = rowHistory[idx];
+        if (tokenId >= 0 && tokenId < vocabSize) // ignore ids outside the vocabulary
+        {
+            atomicAdd(rowLogits + tokenId, -frequencyPenalty); // atomic: several threads may hit the same token id
+        }
+    }
+}
+
+void invokeApplySpeculativeHistoryFrequencyPenalty(float* logits, int32_t const* historyTokens,
+    int32_t const* historyLens, int32_t const* rowSlots, float const* frequencyPenalties, int32_t numRows,
+    int32_t historyCapacity, int32_t vocabSize, cudaStream_t stream) // Host launcher; enqueues the kernel on `stream`.
+{
+    if (numRows == 0 || historyCapacity == 0) // nothing to do; also avoids a zero-sized launch
+    {
+        return;
+    }
+
+    dim3 const grid(numRows); // one block per logits row
+    dim3 const block(256); // threads stride over the slot's history
+    applySpeculativeHistoryFrequencyPenaltyKernel<<<grid, block, 0, stream>>>(
+        logits, historyTokens, historyLens, rowSlots, frequencyPenalties, historyCapacity, vocabSize); // NOTE(review): launch status is not checked here
+}
+
+__global__ void appendSpeculativeAcceptedTokensKernel(int32_t* historyTokens, int32_t* historyLens,
+    int32_t const* seqSlots, int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t acceptedStride,
+    int32_t historyCapacity) // One block per row: appends the row's accepted tokens to its slot's history buffer.
+{
+    auto const row = static_cast<int32_t>(blockIdx.x);
+    auto const slot = seqSlots[row];
+    if (slot < 0) // negative slot: the row is skipped (uniform across the block)
+    {
+        return;
+    }
+    // Clamp to the row width so a bad length can never read past this row of acceptedTokens.
+    auto const acceptedLen = min(max(acceptedLens[row], 0), acceptedStride);
+    if (acceptedLen <= 0) // nothing accepted (uniform across the block)
+    {
+        return;
+    }
+
+    auto const oldLen = min(max(historyLens[slot], 0), historyCapacity); // clamp to [0, historyCapacity]
+    auto const writeLen = min(acceptedLen, max(historyCapacity - oldLen, 0)); // tokens beyond the capacity are dropped
+    auto const rowAccepted = acceptedTokens + static_cast<int64_t>(row) * acceptedStride;
+    auto rowHistory = historyTokens + static_cast<int64_t>(slot) * historyCapacity;
+
+    for (auto idx = static_cast<int32_t>(threadIdx.x); idx < writeLen; idx += static_cast<int32_t>(blockDim.x))
+    {
+        rowHistory[oldLen + idx] = rowAccepted[idx];
+    }
+    __syncthreads(); // every thread must have read historyLens[slot] before thread 0 overwrites it
+    if (threadIdx.x == 0)
+    {
+        historyLens[slot] = oldLen + writeLen;
+    }
+}
+
+void invokeAppendSpeculativeAcceptedTokens(int32_t* historyTokens, int32_t* historyLens, int32_t const* seqSlots,
+    int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t numRows, int32_t acceptedStride,
+    int32_t historyCapacity, cudaStream_t stream) // Host launcher; enqueues the kernel on `stream`.
+{
+    if (numRows == 0 || acceptedStride == 0 || historyCapacity == 0) // nothing to do; also avoids a zero-sized launch
+    {
+        return;
+    }
+
+    dim3 const grid(numRows); // one block per row of acceptedTokens
+    dim3 const block(std::min(acceptedStride, 256)); // at most 256 threads per block
+    appendSpeculativeAcceptedTokensKernel<<<grid, block, 0, stream>>>(
+        historyTokens, historyLens, seqSlots, acceptedTokens, acceptedLens, acceptedStride, historyCapacity); // NOTE(review): launch status is not checked here
+}
+
+template <typename T> // One thread per (row, token): logits[row][token] -= frequencyPenalties[row] * tokenCounts[slot][token]
+__global__ void applySpeculativeCountFrequencyPenaltyKernel(T* logits, int32_t const* tokenCounts,
+    int32_t const* rowSlots, float const* frequencyPenalties, int32_t vocabSize)
+{
+    auto const row = static_cast<int32_t>(blockIdx.x); // grid.x indexes logits rows
+    auto const tokenId = static_cast<int32_t>(blockIdx.y) * static_cast<int32_t>(blockDim.x)
+        + static_cast<int32_t>(threadIdx.x); // grid.y * blockDim.x tiles the vocabulary
+    if (tokenId >= vocabSize) // tail guard for the last vocabulary tile
+    {
+        return;
+    }
+
+    auto const frequencyPenalty = frequencyPenalties[row];
+    if (frequencyPenalty == 0.0f) // zero penalty: the row is left untouched
+    {
+        return;
+    }
+
+    auto const slot = rowSlots[row]; // selects the row of tokenCounts to read
+    if (slot < 0) // negative slot: the row is skipped
+    {
+        return;
+    }
+
+    auto const count = tokenCounts[static_cast<int64_t>(slot) * vocabSize + tokenId];
+    if (count <= 0) // token not counted for this slot: logit unchanged
+    {
+        return;
+    }
+
+    auto rowLogits = logits + static_cast<int64_t>(row) * vocabSize;
+    auto logit = static_cast<float>(rowLogits[tokenId]);
+    logit -= frequencyPenalty * static_cast<float>(count); // arithmetic in float, then cast back to T
+    rowLogits[tokenId] = static_cast<T>(logit);
+}
+
+template <typename T> // Host launcher for applySpeculativeCountFrequencyPenaltyKernel; enqueues on `stream`.
+void invokeApplySpeculativeCountFrequencyPenalty(T* logits, int32_t const* tokenCounts,
+    int32_t const* rowSlots, float const* frequencyPenalties, int32_t numRows, int32_t vocabSize, cudaStream_t stream)
+{
+    if (numRows == 0 || vocabSize == 0) // nothing to do; also avoids a zero-sized launch
+    {
+        return;
+    }
+
+    dim3 const block(256);
+    dim3 const grid(numRows, (vocabSize + static_cast<int32_t>(block.x) - 1) / static_cast<int32_t>(block.x)); // x: rows, y: ceil(vocabSize / 256) tiles
+    applySpeculativeCountFrequencyPenaltyKernel<T><<<grid, block, 0, stream>>>(
+        logits, tokenCounts, rowSlots, frequencyPenalties, vocabSize); // NOTE(review): launch status is not checked here
+}
+
+template void invokeApplySpeculativeCountFrequencyPenalty<float>( // explicit instantiations for float, half and bf16 logits
+    float*, int32_t const*, int32_t const*, float const*, int32_t, int32_t, cudaStream_t);
+template void invokeApplySpeculativeCountFrequencyPenalty<half>(
+    half*, int32_t const*, int32_t const*, float const*, int32_t, int32_t, cudaStream_t);
+template void invokeApplySpeculativeCountFrequencyPenalty<__nv_bfloat16>(
+    __nv_bfloat16*, int32_t const*, int32_t const*, float const*, int32_t, int32_t, cudaStream_t);
+
+__global__ void appendSpeculativeAcceptedTokenCountsKernel(int32_t* tokenCounts, int32_t const* seqSlots,
+    int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t acceptedStride, int32_t vocabSize) // One block per row: bumps tokenCounts[slot][token] for each accepted token.
+{
+    auto const row = static_cast<int32_t>(blockIdx.x);
+    auto const slot = seqSlots[row]; // selects the row of tokenCounts to update
+    if (slot < 0) // negative slot: the row is skipped
+    {
+        return;
+    }
+
+    auto const acceptedLen = min(max(acceptedLens[row], 0), acceptedStride); // clamp to [0, acceptedStride]
+    auto const rowAccepted = acceptedTokens + static_cast<int64_t>(row) * acceptedStride;
+    auto rowCounts = tokenCounts + static_cast<int64_t>(slot) * vocabSize;
+
+    for (auto idx = static_cast<int32_t>(threadIdx.x); idx < acceptedLen; idx += static_cast<int32_t>(blockDim.x))
+    {
+        auto const tokenId = rowAccepted[idx];
+        if (tokenId >= 0 && tokenId < vocabSize) // ignore ids outside the vocabulary
+        {
+            atomicAdd(rowCounts + tokenId, 1); // atomic: several threads may hit the same token id
+        }
+    }
+}
+
+void invokeAppendSpeculativeAcceptedTokenCounts(int32_t* tokenCounts, int32_t const* seqSlots,
+    int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t numRows, int32_t acceptedStride,
+    int32_t vocabSize, cudaStream_t stream) // Host launcher; enqueues the kernel on `stream`.
+{
+    if (numRows == 0 || acceptedStride == 0 || vocabSize == 0) // nothing to do; also avoids a zero-sized launch
+    {
+        return;
+    }
+
+    dim3 const grid(numRows); // one block per row of acceptedTokens
+    dim3 const block(std::min(acceptedStride, 256)); // at most 256 threads per block
+    appendSpeculativeAcceptedTokenCountsKernel<<<grid, block, 0, stream>>>(
+        tokenCounts, seqSlots, acceptedTokens, acceptedLens, acceptedStride, vocabSize); // NOTE(review): launch status is not checked here
+}
+
+// One block per logits row. The first row of each run of adjacent rows that share (slot, penalty) subtracts
+// penalty * count from the listed token logits of every row in the run; the other rows of the run exit early.
+template <typename T>
+__global__ void applySpeculativeSparseCountFrequencyPenaltyKernel(T* logits, int32_t const* tokenIds,
+    int32_t const* tokenCounts, int32_t const* countLens, int32_t const* rowSlots, float const* frequencyPenalties,
+    int32_t numRows, int32_t countCapacity, int32_t vocabSize)
+{
+    auto const firstRow = static_cast<int32_t>(blockIdx.x);
+    auto const penalty = frequencyPenalties[firstRow];
+    if (penalty == 0.0f)
+    {
+        return;
+    }
+    auto const slotIdx = rowSlots[firstRow];
+    if (slotIdx < 0)
+    {
+        return;
+    }
+    // A row whose predecessor has the same (slot, penalty) is covered by the block that starts the run.
+    if (firstRow > 0 && rowSlots[firstRow - 1] == slotIdx && frequencyPenalties[firstRow - 1] == penalty)
+    {
+        return;
+    }
+    // Advance runEnd to the exclusive end of the run that starts at this row.
+    auto runEnd = firstRow + 1;
+    for (; runEnd < numRows && rowSlots[runEnd] == slotIdx && frequencyPenalties[runEnd] == penalty; ++runEnd)
+    {
+    }
+
+    auto const numEntries = min(max(countLens[slotIdx], 0), countCapacity);
+    auto const* slotTokenIds = tokenIds + static_cast<int64_t>(slotIdx) * countCapacity;
+    auto const* slotTokenCounts = tokenCounts + static_cast<int64_t>(slotIdx) * countCapacity;
+    for (auto pos = static_cast<int32_t>(threadIdx.x); pos < numEntries; pos += static_cast<int32_t>(blockDim.x))
+    {
+        auto const token = slotTokenIds[pos];
+        auto const occurrences = slotTokenCounts[pos];
+        if (occurrences <= 0 || token < 0 || token >= vocabSize)
+        {
+            continue;
+        }
+        for (auto target = firstRow; target < runEnd; ++target)
+        {
+            T& cell = logits[static_cast<int64_t>(target) * vocabSize + token];
+            auto updated = static_cast<float>(cell);
+            updated -= penalty * static_cast<float>(occurrences);
+            cell = static_cast<T>(updated);
+        }
+    }
+}
+
+// Host launcher: one 256-thread block per logits row; launches nothing if any extent is zero.
+template <typename T>
+void invokeApplySpeculativeSparseCountFrequencyPenalty(T* logits, int32_t const* tokenIds,
+    int32_t const* tokenCounts, int32_t const* countLens, int32_t const* rowSlots, float const* frequencyPenalties,
+    int32_t numRows, int32_t countCapacity, int32_t vocabSize, cudaStream_t stream)
+{
+    if (!(numRows != 0 && countCapacity != 0 && vocabSize != 0))
+    {
+        return;
+    }
+    dim3 const blocksPerGrid(numRows);
+    dim3 const threadsPerBlock(256);
+    applySpeculativeSparseCountFrequencyPenaltyKernel<T><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(logits,
+        tokenIds, tokenCounts, countLens, rowSlots, frequencyPenalties, numRows, countCapacity, vocabSize);
+}
+
+template void invokeApplySpeculativeSparseCountFrequencyPenalty<float>(float*, int32_t const*, int32_t const*,
+    int32_t const*, int32_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t); // fp32
+template void invokeApplySpeculativeSparseCountFrequencyPenalty<half>(half*, int32_t const*, int32_t const*,
+    int32_t const*, int32_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t); // fp16
+template void invokeApplySpeculativeSparseCountFrequencyPenalty<__nv_bfloat16>(__nv_bfloat16*, int32_t const*,
+    int32_t const*, int32_t const*, int32_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t); // bf16
+
+// One block per row: merges the row's accepted tokens into its slot's sparse (tokenIds, tokenCounts) table.
+__global__ void appendSpeculativeSparseTokenCountsKernel(int32_t* tokenIds, int32_t* tokenCounts, int32_t* countLens,
+    int32_t const* seqSlots, int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t acceptedStride,
+    int32_t countCapacity, int32_t vocabSize)
+{
+    auto const row = static_cast<int32_t>(blockIdx.x);
+    auto const slot = seqSlots[row];
+    if (slot < 0)
+    {
+        return;
+    }
+    // NOTE(review): blocks do not synchronize with each other, so rows are presumably mapped to distinct slots.
+    auto len = min(max(countLens[slot], 0), countCapacity);
+    auto rowTokenIds = tokenIds + static_cast<int64_t>(slot) * countCapacity;
+    auto rowTokenCounts = tokenCounts + static_cast<int64_t>(slot) * countCapacity;
+    auto const rowAccepted = acceptedTokens + static_cast<int64_t>(row) * acceptedStride;
+    auto const acceptedLen = min(max(acceptedLens[row], 0), acceptedStride);
+    // Block-wide scratch: current table length, the token being merged, and the index of its entry (-1 if absent).
+    __shared__ int32_t lenShared;
+    __shared__ int32_t tokenIdShared;
+    __shared__ int32_t foundIdx;
+    if (threadIdx.x == 0)
+    {
+        lenShared = len;
+    }
+    __syncthreads();
+    // Tokens are merged one at a time; an id appended for one token can be matched by the tokens after it.
+    for (auto acceptedIdx = 0; acceptedIdx < acceptedLen; ++acceptedIdx)
+    {
+        if (threadIdx.x == 0)
+        {
+            tokenIdShared = rowAccepted[acceptedIdx];
+            foundIdx = -1;
+        }
+        __syncthreads();
+
+        auto const tokenId = tokenIdShared;
+        if (tokenId >= 0 && tokenId < vocabSize)
+        {
+            auto const currentLen = lenShared;
+            for (auto idx = static_cast<int32_t>(threadIdx.x); idx < currentLen;
+                 idx += static_cast<int32_t>(blockDim.x))
+            {
+                if (rowTokenIds[idx] == tokenId)
+                {
+                    atomicCAS(&foundIdx, -1, idx);
+                }
+            }
+            __syncthreads();
+            // Thread 0 bumps the matching entry, or appends a new one; a new id is dropped when the table is full.
+            if (threadIdx.x == 0)
+            {
+                if (foundIdx >= 0)
+                {
+                    rowTokenCounts[foundIdx] += 1;
+                }
+                else if (lenShared < countCapacity)
+                {
+                    rowTokenIds[lenShared] = tokenId;
+                    rowTokenCounts[lenShared] = 1;
+                    ++lenShared;
+                }
+            }
+        }
+        __syncthreads(); // on every iteration: all reads of the shared token finish before thread 0 overwrites it
+    }
+
+    if (threadIdx.x == 0)
+    {
+        countLens[slot] = lenShared;
+    }
+}
+
+// Host launcher: one block per row with min(countCapacity, 256) threads; launches nothing if any extent is zero.
+void invokeAppendSpeculativeSparseTokenCounts(int32_t* tokenIds, int32_t* tokenCounts, int32_t* countLens,
+    int32_t const* seqSlots, int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t numRows,
+    int32_t acceptedStride, int32_t countCapacity, int32_t vocabSize, cudaStream_t stream)
+{
+    if (!(numRows != 0 && acceptedStride != 0 && countCapacity != 0 && vocabSize != 0))
+    {
+        return;
+    }
+    dim3 const blocksPerGrid(numRows);
+    dim3 const threadsPerBlock(std::min(countCapacity, 256));
+    appendSpeculativeSparseTokenCountsKernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(tokenIds, tokenCounts,
+        countLens, seqSlots, acceptedTokens, acceptedLens, acceptedStride, countCapacity, vocabSize);
+}
+
+// One block per row: copies the row's prompt (token id, count) pairs into its slot's sparse table and stores the
+// resulting length. Pairs with an out-of-range id or a non-positive count are written as (0, 0).
+__global__ void initSpeculativeSparseTokenCountsKernel(int32_t* tokenIds, int32_t* tokenCounts, int32_t* countLens,
+    int32_t const* promptTokenIds, int32_t const* promptTokenCounts, int32_t const* promptLens,
+    int32_t const* seqSlots, int32_t promptCapacity, int32_t countCapacity, int32_t vocabSize)
+{
+    auto const rowIdx = static_cast<int64_t>(blockIdx.x);
+    auto const slotIdx = seqSlots[rowIdx];
+    if (slotIdx < 0)
+    {
+        return;
+    }
+
+    // The copied length is promptLens[row] clamped into [0, min(promptCapacity, countCapacity)].
+    auto const numEntries = min(min(max(promptLens[rowIdx], 0), promptCapacity), countCapacity);
+    auto const slotOffset = static_cast<int64_t>(slotIdx) * countCapacity;
+    auto const rowOffset = rowIdx * promptCapacity;
+    auto* dstIds = tokenIds + slotOffset;
+    auto* dstCounts = tokenCounts + slotOffset;
+    auto const* srcIds = promptTokenIds + rowOffset;
+    auto const* srcCounts = promptTokenCounts + rowOffset;
+
+    for (auto pos = static_cast<int32_t>(threadIdx.x); pos < numEntries; pos += static_cast<int32_t>(blockDim.x))
+    {
+        auto const token = srcIds[pos];
+        auto const occurrences = srcCounts[pos];
+        // Rejected pairs are not compacted away; their position is filled with (0, 0).
+        bool const keep = token >= 0 && token < vocabSize && occurrences > 0;
+        dstIds[pos] = keep ? token : 0;
+        dstCounts[pos] = keep ? occurrences : 0;
+    }
+
+    // The stored length therefore also counts the positions that were zeroed above.
+    if (threadIdx.x == 0)
+    {
+        countLens[slotIdx] = numEntries;
+    }
+}
+
+// Host launcher: one block per row with min(promptCapacity, 256) threads; launches nothing if any extent is zero.
+void invokeInitSpeculativeSparseTokenCounts(int32_t* tokenIds, int32_t* tokenCounts, int32_t* countLens,
+    int32_t const* promptTokenIds, int32_t const* promptTokenCounts, int32_t const* promptLens,
+    int32_t const* seqSlots, int32_t numRows, int32_t promptCapacity, int32_t countCapacity, int32_t vocabSize,
+    cudaStream_t stream)
+{
+    if (!(numRows != 0 && promptCapacity != 0 && countCapacity != 0 && vocabSize != 0))
+    {
+        return;
+    }
+    dim3 const blocksPerGrid(numRows);
+    dim3 const threadsPerBlock(std::min(promptCapacity, 256));
+    initSpeculativeSparseTokenCountsKernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(tokenIds, tokenCounts,
+        countLens, promptTokenIds, promptTokenCounts, promptLens, seqSlots, promptCapacity, countCapacity, vocabSize);
+}
+
+} // namespace kernels
+
+TRTLLM_NAMESPACE_END
diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.h
new file mode 100644
index 00000000000..b9e7bd599d8
--- /dev/null
+++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.h
@@ -0,0 +1,56 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "tensorrt_llm/common/config.h"
+#include "tensorrt_llm/common/cudaDriverWrapper.h"
+#include "tensorrt_llm/common/cudaUtils.h"
+
+#include <cuda_runtime.h>
+#include <cstdint>
+
+TRTLLM_NAMESPACE_BEGIN
+
+namespace kernels
+{
+// NOTE(review): defined in the .cu file; presumably adjusts logits[row][tokenIds[row][i]] by penaltyValues[row][i].
+template <typename T, typename TokenT>
+void invokeApplySpeculativeTokenPenalties(T* logits, TokenT const* tokenIds, float const* penaltyValues, int32_t numRows,
+    int32_t width, int32_t vocabSize, cudaStream_t stream);
+// NOTE(review): float-only overload; presumably penalizes tokens found in each slot's history - confirm in the .cu.
+void invokeApplySpeculativeHistoryFrequencyPenalty(float* logits, int32_t const* historyTokens,
+    int32_t const* historyLens, int32_t const* rowSlots, float const* frequencyPenalties, int32_t numRows,
+    int32_t historyCapacity, int32_t vocabSize, cudaStream_t stream);
+// NOTE(review): presumably appends each row's accepted tokens to its slot's history buffer - confirm in the .cu.
+void invokeAppendSpeculativeAcceptedTokens(int32_t* historyTokens, int32_t* historyLens, int32_t const* seqSlots,
+    int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t numRows, int32_t acceptedStride,
+    int32_t historyCapacity, cudaStream_t stream);
+// Explicitly instantiated for float, half and __nv_bfloat16 logits in the kernel source file.
+template <typename T>
+void invokeApplySpeculativeCountFrequencyPenalty(T* logits, int32_t const* tokenCounts,
+    int32_t const* rowSlots, float const* frequencyPenalties, int32_t numRows, int32_t vocabSize, cudaStream_t stream);
+// One block per row: adds 1 to tokenCounts[slot * vocabSize + token] for each in-range accepted token.
+void invokeAppendSpeculativeAcceptedTokenCounts(int32_t* tokenCounts, int32_t const* seqSlots,
+    int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t numRows, int32_t acceptedStride,
+    int32_t vocabSize, cudaStream_t stream);
+// One block per logits row: subtracts penalty * count for each (id, count) entry stored for the row's slot.
+template <typename T>
+void invokeApplySpeculativeSparseCountFrequencyPenalty(T* logits, int32_t const* tokenIds,
+    int32_t const* tokenCounts, int32_t const* countLens, int32_t const* rowSlots, float const* frequencyPenalties,
+    int32_t numRows, int32_t countCapacity, int32_t vocabSize, cudaStream_t stream);
+// One block per row: merges the row's accepted tokens into the slot's sparse (tokenIds, tokenCounts) table.
+void invokeAppendSpeculativeSparseTokenCounts(int32_t* tokenIds, int32_t* tokenCounts, int32_t* countLens,
+    int32_t const* seqSlots, int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t numRows,
+    int32_t acceptedStride, int32_t countCapacity, int32_t vocabSize, cudaStream_t stream);
+// One block per row: copies prompt (id, count) pairs into the slot's sparse table and sets countLens[slot].
+void invokeInitSpeculativeSparseTokenCounts(int32_t* tokenIds, int32_t* tokenCounts, int32_t* countLens,
+    int32_t const* promptTokenIds, int32_t const* promptTokenCounts, int32_t const* promptLens,
+    int32_t const* seqSlots, int32_t numRows, int32_t promptCapacity, int32_t countCapacity, int32_t vocabSize,
+    cudaStream_t stream);
+
+} // namespace kernels
+
+TRTLLM_NAMESPACE_END
diff --git a/cpp/tensorrt_llm/thop/CMakeLists.txt b/cpp/tensorrt_llm/thop/CMakeLists.txt
index 0d077c4b030..16fdccf207e 100644
--- a/cpp/tensorrt_llm/thop/CMakeLists.txt
+++ b/cpp/tensorrt_llm/thop/CMakeLists.txt
@@ -108,6 +108,7 @@ add_library(
   weightOnlyQuantGemm.cpp
   weightOnlyQuantOp.cpp
   specDecOp.cpp
+  speculativeLogitsPenaltyOp.cpp
   loraOp.cpp
   finegrained_mixed_dtype_gemm_thop.cpp
   tinygemm2.cpp
diff --git a/cpp/tensorrt_llm/thop/speculativeLogitsPenaltyOp.cpp b/cpp/tensorrt_llm/thop/speculativeLogitsPenaltyOp.cpp
new file mode 100644
index 00000000000..082748f3cc8
--- /dev/null
+++ b/cpp/tensorrt_llm/thop/speculativeLogitsPenaltyOp.cpp
@@ -0,0 +1,473 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.h"
+#include "tensorrt_llm/thop/thUtils.h"
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+namespace th = torch;
+namespace tk = tensorrt_llm::kernels;
+
+TRTLLM_NAMESPACE_BEGIN
+
+namespace torch_ext
+{
+
+namespace
+{
+
+template <typename T, typename TokenT>
+void invokeTypedApplyTokenPenalties(th::Tensor& logits, th::Tensor const& tokenIds, th::Tensor const& penaltyValues)
+{
+    // Rows and vocab size come from logits, the per-row width from token_ids; uses the logits device's stream.
+    tk::invokeApplySpeculativeTokenPenalties<T, TokenT>(static_cast<T*>(logits.data_ptr()),
+        static_cast<TokenT const*>(tokenIds.data_ptr()), static_cast<float const*>(penaltyValues.data_ptr()),
+        static_cast<int32_t>(logits.size(0)), static_cast<int32_t>(tokenIds.size(1)),
+        static_cast<int32_t>(logits.size(1)), at::cuda::getCurrentCUDAStream(logits.get_device()).stream());
+}
+
+template <typename T>
+void dispatchTokenType(th::Tensor& logits, th::Tensor const& tokenIds, th::Tensor const& penaltyValues)
+{
+    // Picks the token-id element type from the runtime dtype of token_ids.
+    // Only int32 and int64 are accepted; any other dtype fails the check below.
+    auto const idType = tokenIds.scalar_type();
+    TORCH_CHECK(idType == torch::kInt || idType == torch::kLong, "token_ids dtype must be int32 or int64.");
+    if (idType == torch::kInt)
+    {
+        invokeTypedApplyTokenPenalties<T, int32_t>(logits, tokenIds, penaltyValues);
+        return;
+    }
+    // Remaining case after the check: int64 token ids.
+    invokeTypedApplyTokenPenalties<T, int64_t>(logits, tokenIds, penaltyValues);
+}
+
+template <typename T>
+void invokeTypedApplyCountFrequencyPenalty(
+    th::Tensor& logits, th::Tensor const& tokenCounts, th::Tensor const& rowSlots, th::Tensor const& frequencyPenalties)
+{
+    // Rows and vocab size come from logits; runs on the current CUDA stream of the logits device.
+    tk::invokeApplySpeculativeCountFrequencyPenalty<T>(static_cast<T*>(logits.data_ptr()),
+        static_cast<int32_t const*>(tokenCounts.data_ptr()), static_cast<int32_t const*>(rowSlots.data_ptr()),
+        static_cast<float const*>(frequencyPenalties.data_ptr()), static_cast<int32_t>(logits.size(0)),
+        static_cast<int32_t>(logits.size(1)), at::cuda::getCurrentCUDAStream(logits.get_device()).stream());
+}
+
+template <typename T>
+void invokeTypedApplySparseCountFrequencyPenalty(th::Tensor& logits, th::Tensor const& tokenIds,
+    th::Tensor const& tokenCounts, th::Tensor const& countLens, th::Tensor const& rowSlots,
+    th::Tensor const& frequencyPenalties)
+{
+    auto const launchStream = at::cuda::getCurrentCUDAStream(logits.get_device()).stream(); // logits device
+    tk::invokeApplySpeculativeSparseCountFrequencyPenalty<T>(static_cast<T*>(logits.data_ptr()),
+        static_cast<int32_t const*>(tokenIds.data_ptr()), static_cast<int32_t const*>(tokenCounts.data_ptr()),
+        static_cast<int32_t const*>(countLens.data_ptr()), static_cast<int32_t const*>(rowSlots.data_ptr()),
+        static_cast<float const*>(frequencyPenalties.data_ptr()), static_cast<int32_t>(logits.size(0)),
+        static_cast<int32_t>(tokenIds.size(1)), static_cast<int32_t>(logits.size(1)), launchStream);
+}
+
+} // namespace
+// Tensor-level wrapper: validates logits/token_ids/penalty_values, then dispatches on the logits and token_ids dtypes.
+void speculativeApplyTokenPenalties(th::Tensor& logits, th::Tensor const& tokenIds, th::Tensor const& penaltyValues)
+{
+    TORCH_CHECK(logits.is_cuda(), "logits must be a CUDA tensor.");
+    TORCH_CHECK(logits.is_contiguous(), "logits must be contiguous.");
+    TORCH_CHECK(logits.dim() == 2, "logits must be a 2D tensor.");
+    TORCH_CHECK(tokenIds.is_cuda(), "token_ids must be a CUDA tensor.");
+    TORCH_CHECK(tokenIds.is_contiguous(), "token_ids must be contiguous.");
+    TORCH_CHECK(tokenIds.dim() == 2, "token_ids must be a 2D tensor.");
+    TORCH_CHECK(penaltyValues.is_cuda(), "penalty_values must be a CUDA tensor.");
+    TORCH_CHECK(penaltyValues.is_contiguous(), "penalty_values must be contiguous.");
+    TORCH_CHECK(penaltyValues.dim() == 2, "penalty_values must be a 2D tensor.");
+    TORCH_CHECK(penaltyValues.scalar_type() == torch::kFloat, "penalty_values dtype must be float32.");
+    TORCH_CHECK(tokenIds.size(0) == logits.size(0), "token_ids and logits must have the same row count.");
+    TORCH_CHECK(penaltyValues.size(0) == logits.size(0), "penalty_values and logits must have the same row count.");
+    TORCH_CHECK(penaltyValues.size(1) == tokenIds.size(1), "penalty_values and token_ids widths must match.");
+    // Empty input (no rows or zero width) returns here, i.e. before the logits/token_ids dtypes are checked below.
+    if (logits.size(0) == 0 || tokenIds.size(1) == 0)
+    {
+        return;
+    }
+    // Dispatch on the logits element type; the token_ids dtype is resolved inside dispatchTokenType.
+    switch (logits.scalar_type())
+    {
+    case torch::kFloat32:
+        dispatchTokenType<float>(logits, tokenIds, penaltyValues);
+        break;
+    case torch::kFloat16:
+        dispatchTokenType<half>(logits, tokenIds, penaltyValues);
+        break;
+    case torch::kBFloat16:
+        dispatchTokenType<__nv_bfloat16>(logits, tokenIds, penaltyValues);
+        break;
+    default:
+        TORCH_CHECK(false, "logits dtype must be float32, float16, or bfloat16.");
+    }
+}
+// Tensor-level wrapper: validates the arguments (float32 logits only) and calls the history-penalty launcher.
+void speculativeApplyHistoryFrequencyPenalty(th::Tensor& logits, th::Tensor const& historyTokens,
+    th::Tensor const& historyLens, th::Tensor const& rowSlots, th::Tensor const& frequencyPenalties)
+{
+    TORCH_CHECK(logits.is_cuda(), "logits must be a CUDA tensor.");
+    TORCH_CHECK(logits.is_contiguous(), "logits must be contiguous.");
+    TORCH_CHECK(logits.dim() == 2, "logits must be a 2D tensor.");
+    TORCH_CHECK(logits.scalar_type() == torch::kFloat32, "logits dtype must be float32.");
+    TORCH_CHECK(historyTokens.is_cuda(), "history_tokens must be a CUDA tensor.");
+    TORCH_CHECK(historyTokens.is_contiguous(), "history_tokens must be contiguous.");
+    TORCH_CHECK(historyTokens.dim() == 2, "history_tokens must be a 2D tensor.");
+    TORCH_CHECK(historyTokens.scalar_type() == torch::kInt, "history_tokens dtype must be int32.");
+    TORCH_CHECK(historyLens.is_cuda(), "history_lens must be a CUDA tensor.");
+    TORCH_CHECK(historyLens.is_contiguous(), "history_lens must be contiguous.");
+    TORCH_CHECK(historyLens.dim() == 1, "history_lens must be a 1D tensor.");
+    TORCH_CHECK(historyLens.scalar_type() == torch::kInt, "history_lens dtype must be int32.");
+    TORCH_CHECK(rowSlots.is_cuda(), "row_slots must be a CUDA tensor.");
+    TORCH_CHECK(rowSlots.is_contiguous(), "row_slots must be contiguous.");
+    TORCH_CHECK(rowSlots.dim() == 1, "row_slots must be a 1D tensor.");
+    TORCH_CHECK(rowSlots.scalar_type() == torch::kInt, "row_slots dtype must be int32.");
+    TORCH_CHECK(frequencyPenalties.is_cuda(), "frequency_penalties must be a CUDA tensor.");
+    TORCH_CHECK(frequencyPenalties.is_contiguous(), "frequency_penalties must be contiguous.");
+    TORCH_CHECK(frequencyPenalties.dim() == 1, "frequency_penalties must be a 1D tensor.");
+    TORCH_CHECK(frequencyPenalties.scalar_type() == torch::kFloat, "frequency_penalties dtype must be float32.");
+    TORCH_CHECK(rowSlots.size(0) == logits.size(0), "row_slots and logits must have the same row count.");
+    TORCH_CHECK(
+        frequencyPenalties.size(0) == logits.size(0), "frequency_penalties and logits must have the same row count.");
+    TORCH_CHECK(historyLens.size(0) == historyTokens.size(0), "history_lens and history_tokens slot count mismatch.");
+    // Empty input (no logits rows or zero history capacity): nothing to launch.
+    if (logits.size(0) == 0 || historyTokens.size(1) == 0)
+    {
+        return;
+    }
+    // Current CUDA stream of the logits device; rows/vocab come from logits, capacity from history_tokens.
+    auto stream = at::cuda::getCurrentCUDAStream(logits.get_device()).stream();
+    tk::invokeApplySpeculativeHistoryFrequencyPenalty(reinterpret_cast<float*>(logits.data_ptr()),
+        reinterpret_cast<int32_t const*>(historyTokens.data_ptr()), reinterpret_cast<int32_t const*>(historyLens.data_ptr()),
+        reinterpret_cast<int32_t const*>(rowSlots.data_ptr()),
+        reinterpret_cast<float const*>(frequencyPenalties.data_ptr()), static_cast<int32_t>(logits.size(0)),
+        static_cast<int32_t>(historyTokens.size(1)), static_cast<int32_t>(logits.size(1)), stream);
+}
+// Tensor-level wrapper: validates the arguments and calls invokeAppendSpeculativeAcceptedTokens on the history buffers.
+void speculativeAppendAcceptedTokens(th::Tensor& historyTokens, th::Tensor& historyLens, th::Tensor const& seqSlots,
+    th::Tensor const& acceptedTokens, th::Tensor const& acceptedLens)
+{
+    TORCH_CHECK(historyTokens.is_cuda(), "history_tokens must be a CUDA tensor.");
+    TORCH_CHECK(historyTokens.is_contiguous(), "history_tokens must be contiguous.");
+    TORCH_CHECK(historyTokens.dim() == 2, "history_tokens must be a 2D tensor.");
+    TORCH_CHECK(historyTokens.scalar_type() == torch::kInt, "history_tokens dtype must be int32.");
+    TORCH_CHECK(historyLens.is_cuda(), "history_lens must be a CUDA tensor.");
+    TORCH_CHECK(historyLens.is_contiguous(), "history_lens must be contiguous.");
+    TORCH_CHECK(historyLens.dim() == 1, "history_lens must be a 1D tensor.");
+    TORCH_CHECK(historyLens.scalar_type() == torch::kInt, "history_lens dtype must be int32.");
+    TORCH_CHECK(seqSlots.is_cuda(), "seq_slots must be a CUDA tensor.");
+    TORCH_CHECK(seqSlots.is_contiguous(), "seq_slots must be contiguous.");
+    TORCH_CHECK(seqSlots.dim() == 1, "seq_slots must be a 1D tensor.");
+    TORCH_CHECK(seqSlots.scalar_type() == torch::kInt, "seq_slots dtype must be int32.");
+    TORCH_CHECK(acceptedTokens.is_cuda(), "accepted_tokens must be a CUDA tensor.");
+    TORCH_CHECK(acceptedTokens.is_contiguous(), "accepted_tokens must be contiguous.");
+    TORCH_CHECK(acceptedTokens.dim() == 2, "accepted_tokens must be a 2D tensor.");
+    TORCH_CHECK(acceptedTokens.scalar_type() == torch::kInt, "accepted_tokens dtype must be int32.");
+    TORCH_CHECK(acceptedLens.is_cuda(), "accepted_lens must be a CUDA tensor.");
+    TORCH_CHECK(acceptedLens.is_contiguous(), "accepted_lens must be contiguous.");
+    TORCH_CHECK(acceptedLens.dim() == 1, "accepted_lens must be a 1D tensor.");
+    TORCH_CHECK(acceptedLens.scalar_type() == torch::kInt, "accepted_lens dtype must be int32.");
+    TORCH_CHECK(seqSlots.size(0) == acceptedTokens.size(0), "seq_slots and accepted_tokens row count mismatch.");
+    TORCH_CHECK(acceptedLens.size(0) == acceptedTokens.size(0), "accepted_lens and accepted_tokens row count mismatch.");
+    TORCH_CHECK(historyLens.size(0) == historyTokens.size(0), "history_lens and history_tokens slot count mismatch.");
+    // Nothing to launch without rows, without accepted-token columns, or with a zero-capacity history.
+    if (acceptedTokens.size(0) == 0 || acceptedTokens.size(1) == 0 || historyTokens.size(1) == 0)
+    {
+        return;
+    }
+    // Current CUDA stream of the history_tokens device; rows/stride come from accepted_tokens.
+    auto stream = at::cuda::getCurrentCUDAStream(historyTokens.get_device()).stream();
+    tk::invokeAppendSpeculativeAcceptedTokens(reinterpret_cast<int32_t*>(historyTokens.data_ptr()),
+        reinterpret_cast<int32_t*>(historyLens.data_ptr()), reinterpret_cast<int32_t const*>(seqSlots.data_ptr()),
+        reinterpret_cast<int32_t const*>(acceptedTokens.data_ptr()), reinterpret_cast<int32_t const*>(acceptedLens.data_ptr()),
+        static_cast<int32_t>(acceptedTokens.size(0)), static_cast<int32_t>(acceptedTokens.size(1)),
+        static_cast<int32_t>(historyTokens.size(1)), stream);
+}
+// Tensor-level wrapper: validates the arguments, then dispatches the count-based penalty on the logits dtype.
+void speculativeApplyCountFrequencyPenalty(th::Tensor& logits, th::Tensor const& tokenCounts,
+    th::Tensor const& rowSlots, th::Tensor const& frequencyPenalties)
+{
+    TORCH_CHECK(logits.is_cuda(), "logits must be a CUDA tensor.");
+    TORCH_CHECK(logits.is_contiguous(), "logits must be contiguous.");
+    TORCH_CHECK(logits.dim() == 2, "logits must be a 2D tensor.");
+    TORCH_CHECK(tokenCounts.is_cuda(), "token_counts must be a CUDA tensor.");
+    TORCH_CHECK(tokenCounts.is_contiguous(), "token_counts must be contiguous.");
+    TORCH_CHECK(tokenCounts.dim() == 2, "token_counts must be a 2D tensor.");
+    TORCH_CHECK(tokenCounts.scalar_type() == torch::kInt, "token_counts dtype must be int32.");
+    TORCH_CHECK(rowSlots.is_cuda(), "row_slots must be a CUDA tensor.");
+    TORCH_CHECK(rowSlots.is_contiguous(), "row_slots must be contiguous.");
+    TORCH_CHECK(rowSlots.dim() == 1, "row_slots must be a 1D tensor.");
+    TORCH_CHECK(rowSlots.scalar_type() == torch::kInt, "row_slots dtype must be int32.");
+    TORCH_CHECK(frequencyPenalties.is_cuda(), "frequency_penalties must be a CUDA tensor.");
+    TORCH_CHECK(frequencyPenalties.is_contiguous(), "frequency_penalties must be contiguous.");
+    TORCH_CHECK(frequencyPenalties.dim() == 1, "frequency_penalties must be a 1D tensor.");
+    TORCH_CHECK(frequencyPenalties.scalar_type() == torch::kFloat, "frequency_penalties dtype must be float32.");
+    TORCH_CHECK(rowSlots.size(0) == logits.size(0), "row_slots and logits must have the same row count.");
+    TORCH_CHECK(
+        frequencyPenalties.size(0) == logits.size(0), "frequency_penalties and logits must have the same row count.");
+    TORCH_CHECK(tokenCounts.size(1) == logits.size(1), "token_counts and logits vocab size mismatch.");
+    // Empty logits return here, i.e. before the logits dtype is checked by the switch below.
+    if (logits.size(0) == 0 || logits.size(1) == 0)
+    {
+        return;
+    }
+    // Dispatch on the logits element type; any dtype other than fp32/fp16/bf16 is rejected.
+    switch (logits.scalar_type())
+    {
+    case torch::kFloat32:
+        invokeTypedApplyCountFrequencyPenalty<float>(logits, tokenCounts, rowSlots, frequencyPenalties);
+        break;
+    case torch::kFloat16:
+        invokeTypedApplyCountFrequencyPenalty<half>(logits, tokenCounts, rowSlots, frequencyPenalties);
+        break;
+    case torch::kBFloat16:
+        invokeTypedApplyCountFrequencyPenalty<__nv_bfloat16>(logits, tokenCounts, rowSlots, frequencyPenalties);
+        break;
+    default:
+        TORCH_CHECK(false, "logits dtype must be float32, float16, or bfloat16.");
+    }
+}
+// Tensor-level wrapper: validates the arguments and calls the launcher that updates token_counts in place.
+void speculativeAppendAcceptedTokenCounts(th::Tensor& tokenCounts, th::Tensor const& seqSlots,
+    th::Tensor const& acceptedTokens, th::Tensor const& acceptedLens)
+{
+    TORCH_CHECK(tokenCounts.is_cuda(), "token_counts must be a CUDA tensor.");
+    TORCH_CHECK(tokenCounts.is_contiguous(), "token_counts must be contiguous.");
+    TORCH_CHECK(tokenCounts.dim() == 2, "token_counts must be a 2D tensor.");
+    TORCH_CHECK(tokenCounts.scalar_type() == torch::kInt, "token_counts dtype must be int32.");
+    TORCH_CHECK(seqSlots.is_cuda(), "seq_slots must be a CUDA tensor.");
+    TORCH_CHECK(seqSlots.is_contiguous(), "seq_slots must be contiguous.");
+    TORCH_CHECK(seqSlots.dim() == 1, "seq_slots must be a 1D tensor.");
+    TORCH_CHECK(seqSlots.scalar_type() == torch::kInt, "seq_slots dtype must be int32.");
+    TORCH_CHECK(acceptedTokens.is_cuda(), "accepted_tokens must be a CUDA tensor.");
+    TORCH_CHECK(acceptedTokens.is_contiguous(), "accepted_tokens must be contiguous.");
+    TORCH_CHECK(acceptedTokens.dim() == 2, "accepted_tokens must be a 2D tensor.");
+    TORCH_CHECK(acceptedTokens.scalar_type() == torch::kInt, "accepted_tokens dtype must be int32.");
+    TORCH_CHECK(acceptedLens.is_cuda(), "accepted_lens must be a CUDA tensor.");
+    TORCH_CHECK(acceptedLens.is_contiguous(), "accepted_lens must be contiguous.");
+    TORCH_CHECK(acceptedLens.dim() == 1, "accepted_lens must be a 1D tensor.");
+    TORCH_CHECK(acceptedLens.scalar_type() == torch::kInt, "accepted_lens dtype must be int32.");
+    TORCH_CHECK(seqSlots.size(0) == acceptedTokens.size(0), "seq_slots and accepted_tokens row count mismatch.");
+    TORCH_CHECK(acceptedLens.size(0) == acceptedTokens.size(0), "accepted_lens and accepted_tokens row count mismatch.");
+    // Nothing to launch without rows, without accepted-token columns, or with a zero-width token_counts.
+    if (acceptedTokens.size(0) == 0 || acceptedTokens.size(1) == 0 || tokenCounts.size(1) == 0)
+    {
+        return;
+    }
+    // Current CUDA stream of the token_counts device; token_counts.size(1) is passed as the vocab size.
+    auto stream = at::cuda::getCurrentCUDAStream(tokenCounts.get_device()).stream();
+    tk::invokeAppendSpeculativeAcceptedTokenCounts(reinterpret_cast<int32_t*>(tokenCounts.data_ptr()),
+        reinterpret_cast<int32_t const*>(seqSlots.data_ptr()), reinterpret_cast<int32_t const*>(acceptedTokens.data_ptr()),
+        reinterpret_cast<int32_t const*>(acceptedLens.data_ptr()), static_cast<int32_t>(acceptedTokens.size(0)),
+        static_cast<int32_t>(acceptedTokens.size(1)), static_cast<int32_t>(tokenCounts.size(1)), stream);
+}
+// Tensor-level wrapper: validates the arguments, then dispatches the sparse-count penalty on the logits dtype.
+void speculativeApplySparseCountFrequencyPenalty(th::Tensor& logits, th::Tensor const& tokenIds,
+    th::Tensor const& tokenCounts, th::Tensor const& countLens, th::Tensor const& rowSlots,
+    th::Tensor const& frequencyPenalties)
+{
+    TORCH_CHECK(logits.is_cuda(), "logits must be a CUDA tensor.");
+    TORCH_CHECK(logits.is_contiguous(), "logits must be contiguous.");
+    TORCH_CHECK(logits.dim() == 2, "logits must be a 2D tensor.");
+    TORCH_CHECK(tokenIds.is_cuda(), "token_ids must be a CUDA tensor.");
+    TORCH_CHECK(tokenIds.is_contiguous(), "token_ids must be contiguous.");
+    TORCH_CHECK(tokenIds.dim() == 2, "token_ids must be a 2D tensor.");
+    TORCH_CHECK(tokenIds.scalar_type() == torch::kInt, "token_ids dtype must be int32.");
+    TORCH_CHECK(tokenCounts.is_cuda(), "token_counts must be a CUDA tensor.");
+    TORCH_CHECK(tokenCounts.is_contiguous(), "token_counts must be contiguous.");
+    TORCH_CHECK(tokenCounts.dim() == 2, "token_counts must be a 2D tensor.");
+    TORCH_CHECK(tokenCounts.scalar_type() == torch::kInt, "token_counts dtype must be int32.");
+    TORCH_CHECK(countLens.is_cuda(), "count_lens must be a CUDA tensor.");
+    TORCH_CHECK(countLens.is_contiguous(), "count_lens must be contiguous.");
+    TORCH_CHECK(countLens.dim() == 1, "count_lens must be a 1D tensor.");
+    TORCH_CHECK(countLens.scalar_type() == torch::kInt, "count_lens dtype must be int32.");
+    TORCH_CHECK(rowSlots.is_cuda(), "row_slots must be a CUDA tensor.");
+    TORCH_CHECK(rowSlots.is_contiguous(), "row_slots must be contiguous.");
+    TORCH_CHECK(rowSlots.dim() == 1, "row_slots must be a 1D tensor.");
+    TORCH_CHECK(rowSlots.scalar_type() == torch::kInt, "row_slots dtype must be int32.");
+    TORCH_CHECK(frequencyPenalties.is_cuda(), "frequency_penalties must be a CUDA tensor.");
+    TORCH_CHECK(frequencyPenalties.is_contiguous(), "frequency_penalties must be contiguous.");
+    TORCH_CHECK(frequencyPenalties.dim() == 1, "frequency_penalties must be a 1D tensor.");
+    TORCH_CHECK(frequencyPenalties.scalar_type() == torch::kFloat, "frequency_penalties dtype must be float32.");
+    TORCH_CHECK(tokenIds.size(0) == tokenCounts.size(0), "token_ids and token_counts slot count mismatch.");
+    TORCH_CHECK(tokenIds.size(1) == tokenCounts.size(1), "token_ids and token_counts capacity mismatch.");
+    TORCH_CHECK(countLens.size(0) == tokenIds.size(0), "count_lens and token_ids slot count mismatch.");
+    TORCH_CHECK(rowSlots.size(0) == logits.size(0), "row_slots and logits must have the same row count.");
+    TORCH_CHECK(
+        frequencyPenalties.size(0) == logits.size(0), "frequency_penalties and logits must have the same row count.");
+    // Empty input returns here, i.e. before the logits dtype is checked by the switch below.
+    if (logits.size(0) == 0 || tokenIds.size(1) == 0 || logits.size(1) == 0)
+    {
+        return;
+    }
+    // Dispatch on the logits element type; any dtype other than fp32/fp16/bf16 is rejected.
+    switch (logits.scalar_type())
+    {
+    case torch::kFloat32:
+        invokeTypedApplySparseCountFrequencyPenalty<float>(
+            logits, tokenIds, tokenCounts, countLens, rowSlots, frequencyPenalties);
+        break;
+    case torch::kFloat16:
+        invokeTypedApplySparseCountFrequencyPenalty<half>(
+            logits, tokenIds, tokenCounts, countLens, rowSlots, frequencyPenalties);
+        break;
+    case torch::kBFloat16:
+        invokeTypedApplySparseCountFrequencyPenalty<__nv_bfloat16>(
+            logits, tokenIds, tokenCounts, countLens, rowSlots, frequencyPenalties);
+        break;
+    default:
+        TORCH_CHECK(false, "logits dtype must be float32, float16, or bfloat16.");
+    }
+}
+
+// Torch op: validates the arguments, then launches tk::invokeAppendSpeculativeSparseTokenCounts on the current
+// CUDA stream of tokenIds' device. tokenIds, tokenCounts and countLens are passed as mutable buffers.
+//
+// All tensors must be contiguous int32 CUDA tensors:
+//   tokenIds, tokenCounts   2D with identical shapes; dim 0 is the slot count, dim 1 the capacity.
+//   countLens               1D, one entry per slot.
+//   acceptedTokens          2D; seqSlots and acceptedLens are 1D with one entry per acceptedTokens row.
+// vocabSize must be positive and is narrowed to int32 for the kernel call.
+// Returns without launching when acceptedTokens has no rows or no columns, or when the capacity is zero.
+// NOTE(review): how accepted tokens are merged into the tables is defined by the kernel, not visible here.
+void speculativeAppendSparseTokenCounts(th::Tensor& tokenIds, th::Tensor& tokenCounts, th::Tensor& countLens,
+    th::Tensor const& seqSlots, th::Tensor const& acceptedTokens, th::Tensor const& acceptedLens, int64_t vocabSize)
+{
+    // One validator for all six tensors; it reports the same messages as spelled-out per-tensor checks.
+    auto const requireInt32Cuda = [](th::Tensor const& tensor, char const* name, int64_t rank)
+    {
+        TORCH_CHECK(tensor.is_cuda(), name, " must be a CUDA tensor.");
+        TORCH_CHECK(tensor.is_contiguous(), name, " must be contiguous.");
+        TORCH_CHECK(tensor.dim() == rank, name, " must be a ", rank, "D tensor.");
+        TORCH_CHECK(tensor.scalar_type() == torch::kInt, name, " dtype must be int32.");
+    };
+    requireInt32Cuda(tokenIds, "token_ids", 2);
+    requireInt32Cuda(tokenCounts, "token_counts", 2);
+    requireInt32Cuda(countLens, "count_lens", 1);
+    requireInt32Cuda(seqSlots, "seq_slots", 1);
+    requireInt32Cuda(acceptedTokens, "accepted_tokens", 2);
+    requireInt32Cuda(acceptedLens, "accepted_lens", 1);
+    TORCH_CHECK(tokenIds.size(0) == tokenCounts.size(0), "token_ids and token_counts slot count mismatch.");
+    TORCH_CHECK(tokenIds.size(1) == tokenCounts.size(1), "token_ids and token_counts capacity mismatch.");
+    TORCH_CHECK(countLens.size(0) == tokenIds.size(0), "count_lens and token_ids slot count mismatch.");
+    TORCH_CHECK(seqSlots.size(0) == acceptedTokens.size(0), "seq_slots and accepted_tokens row count mismatch.");
+    TORCH_CHECK(acceptedLens.size(0) == acceptedTokens.size(0), "accepted_lens and accepted_tokens row count mismatch.");
+    TORCH_CHECK(vocabSize > 0, "vocab_size must be positive.");
+
+    if (acceptedTokens.size(0) == 0 || acceptedTokens.size(1) == 0 || tokenIds.size(1) == 0)
+    {
+        return;
+    }
+
+    auto const stream = at::cuda::getCurrentCUDAStream(tokenIds.get_device()).stream();
+    tk::invokeAppendSpeculativeSparseTokenCounts(reinterpret_cast<int32_t*>(tokenIds.data_ptr()),
+        reinterpret_cast<int32_t*>(tokenCounts.data_ptr()), reinterpret_cast<int32_t*>(countLens.data_ptr()),
+        reinterpret_cast<int32_t const*>(seqSlots.data_ptr()), reinterpret_cast<int32_t const*>(acceptedTokens.data_ptr()),
+        reinterpret_cast<int32_t const*>(acceptedLens.data_ptr()), static_cast<int32_t>(acceptedTokens.size(0)),
+        static_cast<int32_t>(acceptedTokens.size(1)), static_cast<int32_t>(tokenIds.size(1)),
+        static_cast<int32_t>(vocabSize), stream);
+}
+
+// Torch op: validates the arguments, then launches tk::invokeInitSpeculativeSparseTokenCounts on the current CUDA
+// stream of tokenIds' device. tokenIds, tokenCounts and countLens are passed as mutable buffers.
+//
+// All tensors must be contiguous int32 CUDA tensors:
+//   tokenIds, tokenCounts               2D with identical shapes; dim 0 is the slot count, dim 1 the capacity.
+//   countLens                           1D, one entry per slot.
+//   promptTokenIds, promptTokenCounts   2D with identical shapes.
+//   promptLens, seqSlots                1D, one entry per promptTokenIds row.
+// vocabSize must be positive and is narrowed to int32 for the kernel call.
+//
+// Returns without launching when promptTokenIds has no rows or no columns, or when the capacity is zero.
+// NOTE(review): how prompt counts are written into the per-slot tables is defined by the kernel, not visible
+// here; presumably seqSlots selects the destination slot of each prompt row -- confirm against the kernel.
+void speculativeInitSparseTokenCounts(th::Tensor& tokenIds, th::Tensor& tokenCounts, th::Tensor& countLens,
+    th::Tensor const& promptTokenIds, th::Tensor const& promptTokenCounts, th::Tensor const& promptLens,
+    th::Tensor const& seqSlots, int64_t vocabSize)
+{
+    // One validator for all seven tensors; it reports the same messages as spelled-out per-tensor checks.
+    auto const requireInt32Cuda = [](th::Tensor const& tensor, char const* name, int64_t rank)
+    {
+        TORCH_CHECK(tensor.is_cuda(), name, " must be a CUDA tensor.");
+        TORCH_CHECK(tensor.is_contiguous(), name, " must be contiguous.");
+        TORCH_CHECK(tensor.dim() == rank, name, " must be a ", rank, "D tensor.");
+        TORCH_CHECK(tensor.scalar_type() == torch::kInt, name, " dtype must be int32.");
+    };
+    requireInt32Cuda(tokenIds, "token_ids", 2);
+    requireInt32Cuda(tokenCounts, "token_counts", 2);
+    requireInt32Cuda(countLens, "count_lens", 1);
+    requireInt32Cuda(promptTokenIds, "prompt_token_ids", 2);
+    requireInt32Cuda(promptTokenCounts, "prompt_token_counts", 2);
+    requireInt32Cuda(promptLens, "prompt_lens", 1);
+    requireInt32Cuda(seqSlots, "seq_slots", 1);
+    TORCH_CHECK(tokenIds.size(0) == tokenCounts.size(0), "token_ids and token_counts slot count mismatch.");
+    TORCH_CHECK(tokenIds.size(1) == tokenCounts.size(1), "token_ids and token_counts capacity mismatch.");
+    TORCH_CHECK(countLens.size(0) == tokenIds.size(0), "count_lens and token_ids slot count mismatch.");
+    TORCH_CHECK(promptTokenIds.size(0) == promptTokenCounts.size(0),
+        "prompt_token_ids and prompt_token_counts row count mismatch.");
+    TORCH_CHECK(promptTokenIds.size(1) == promptTokenCounts.size(1),
+        "prompt_token_ids and prompt_token_counts capacity mismatch.");
+    TORCH_CHECK(promptLens.size(0) == promptTokenIds.size(0), "prompt_lens and prompt_token_ids row count mismatch.");
+    TORCH_CHECK(seqSlots.size(0) == promptTokenIds.size(0), "seq_slots and prompt_token_ids row count mismatch.");
+    TORCH_CHECK(vocabSize > 0, "vocab_size must be positive.");
+
+    if (promptTokenIds.size(0) == 0 || promptTokenIds.size(1) == 0 || tokenIds.size(1) == 0)
+    {
+        return;
+    }
+
+    auto const stream = at::cuda::getCurrentCUDAStream(tokenIds.get_device()).stream();
+    tk::invokeInitSpeculativeSparseTokenCounts(reinterpret_cast<int32_t*>(tokenIds.data_ptr()),
+        reinterpret_cast<int32_t*>(tokenCounts.data_ptr()), reinterpret_cast<int32_t*>(countLens.data_ptr()),
+        reinterpret_cast<int32_t const*>(promptTokenIds.data_ptr()),
+        reinterpret_cast<int32_t const*>(promptTokenCounts.data_ptr()),
+        reinterpret_cast<int32_t const*>(promptLens.data_ptr()), reinterpret_cast<int32_t const*>(seqSlots.data_ptr()),
+        static_cast<int32_t>(promptTokenIds.size(0)), static_cast<int32_t>(promptTokenIds.size(1)),
+        static_cast<int32_t>(tokenIds.size(1)), static_cast<int32_t>(vocabSize), stream);
+}
+
+} // namespace torch_ext
+
+TRTLLM_NAMESPACE_END
+
+TORCH_LIBRARY_FRAGMENT(trtllm, m)
+{
+    m.def("speculative_apply_token_penalties(Tensor(a!) logits, Tensor token_ids, Tensor penalty_values) -> ()");
+    m.def(
+        "speculative_apply_history_frequency_penalty(Tensor(a!) logits, Tensor history_tokens, Tensor history_lens, Tensor row_slots, Tensor frequency_penalties) -> ()");
+    m.def(
+        "speculative_append_accepted_tokens(Tensor(a!) history_tokens, Tensor(b!) history_lens, Tensor seq_slots, Tensor accepted_tokens, Tensor accepted_lens) -> ()");
+    m.def(
+        "speculative_apply_count_frequency_penalty(Tensor(a!) logits, Tensor token_counts, Tensor row_slots, Tensor frequency_penalties) -> ()");
+    m.def(
+        "speculative_append_accepted_token_counts(Tensor(a!) token_counts, Tensor seq_slots, Tensor accepted_tokens, Tensor accepted_lens) -> ()");
+    m.def(
+        "speculative_apply_sparse_count_frequency_penalty(Tensor(a!) logits, Tensor token_ids, Tensor token_counts, Tensor count_lens, Tensor row_slots, Tensor frequency_penalties) -> ()");
+    m.def(
+        "speculative_append_sparse_token_counts(Tensor(a!) token_ids, Tensor(b!) token_counts, Tensor(c!) count_lens, Tensor seq_slots, Tensor accepted_tokens, Tensor accepted_lens, int vocab_size) -> ()");
+    m.def(
+        "speculative_init_sparse_token_counts(Tensor(a!) token_ids, Tensor(b!) token_counts, Tensor(c!) count_lens, Tensor prompt_token_ids, Tensor prompt_token_counts, Tensor prompt_lens, Tensor seq_slots, int vocab_size) -> ()");
+}
+
+TORCH_LIBRARY_IMPL(trtllm, CUDA, m)
+{
+    m.impl("speculative_apply_token_penalties", &tensorrt_llm::torch_ext::speculativeApplyTokenPenalties);
+    m.impl("speculative_apply_history_frequency_penalty",
+        &tensorrt_llm::torch_ext::speculativeApplyHistoryFrequencyPenalty);
+    m.impl("speculative_append_accepted_tokens", &tensorrt_llm::torch_ext::speculativeAppendAcceptedTokens);
+    m.impl("speculative_apply_count_frequency_penalty",
+        &tensorrt_llm::torch_ext::speculativeApplyCountFrequencyPenalty);
+    m.impl("speculative_append_accepted_token_counts",
+        &tensorrt_llm::torch_ext::speculativeAppendAcceptedTokenCounts);
+    m.impl("speculative_apply_sparse_count_frequency_penalty",
+        &tensorrt_llm::torch_ext::speculativeApplySparseCountFrequencyPenalty);
+    m.impl("speculative_append_sparse_token_counts",
+        &tensorrt_llm::torch_ext::speculativeAppendSparseTokenCounts);
+    m.impl("speculative_init_sparse_token_counts", &tensorrt_llm::torch_ext::speculativeInitSparseTokenCounts);
+}
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index a32734bd599..cea5691e222 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -332,6 +332,9 @@ def __init__(
 
         self.is_warmup = False
         self.previous_request_ids = []
+        self.previous_device_sampled_request_ids: set[int] = set()
+        self.debug_spec_device_draft_guard = (
+            os.environ.get("TRTLLM_SPEC_DRAFT_GUARD_DEBUG", "0") == "1")
         self.has_previous_device_draft = False
         self.previous_accepted_tokens_cuda = torch.empty((self.batch_size, ),
                                                          dtype=torch.int,
@@ -1410,7 +1413,8 @@ def _set_up_spec_metadata(
                 max_num_tokens=self.max_num_tokens,
                 spec_resource_manager=spec_resource_manager,
                 is_draft_model=self.is_draft_model,
-                max_seq_len=self.max_seq_len)
+                max_seq_len=self.max_seq_len,
+                max_num_sequence_slots=self.get_max_num_sequences())
 
         if self.spec_metadata is not None:
             return self.spec_metadata
@@ -1421,7 +1425,8 @@ def _set_up_spec_metadata(
             max_num_tokens=self.max_num_tokens,
             spec_resource_manager=spec_resource_manager,
             is_draft_model=self.is_draft_model,
-            max_seq_len=self.max_seq_len)
+            max_seq_len=self.max_seq_len,
+            max_num_sequence_slots=self.get_max_num_sequences())
         return self.spec_metadata
 
     def __del__(self) -> None:
@@ -2374,13 +2379,27 @@ def _prepare_tp_inputs(
         extend_dummy_requests = []
         generation_requests = []
         first_draft_requests = []
+        previous_device_sampled_request_ids = self.previous_device_sampled_request_ids
         # Collect generation request IDs during categorization to avoid
         # a separate iteration over scheduled_requests.generation_requests later.
         all_gen_request_ids = []
         for request in scheduled_requests.generation_requests:
             all_gen_request_ids.append(request.py_request_id)
-            if get_draft_token_length(
-                    request) > 0 or next_draft_tokens_device is not None:
+            # Trust a device draft only for requests sampled last iteration.
+            has_device_state = (next_draft_tokens_device is not None
+                                and request.py_batch_idx is not None)
+            has_previous_device_draft = (
+                has_device_state and request.py_request_id
+                in previous_device_sampled_request_ids)
+            if (self.debug_spec_device_draft_guard and has_device_state
+                    and not has_previous_device_draft):
+                # f-string: do not rely on logger %-interpolation (unverified).
+                logger.info(
+                    "Ignoring stale speculative device draft for "
+                    f"request_id={request.py_request_id} "
+                    f"prev_seq_slot={request.py_batch_idx} "
+                    f"current_seq_slot={request.py_seq_slot}")
+            if get_draft_token_length(request) > 0 or has_previous_device_draft:
                 if request.is_dummy:
                     extend_dummy_requests.append(request)
                 else:
@@ -2416,7 +2435,11 @@ def _prepare_tp_inputs(
             # (1) next_draft_tokens_device is None, which means overlap scheduler is disabled; or
             # (2) a dummy request; or
             # (3) the first step in the generation server of disaggregated serving
-            if next_draft_tokens_device is None or request.is_dummy or request.py_batch_idx is None:
+            has_previous_device_draft = (
+                next_draft_tokens_device is not None
+                and request.py_batch_idx is not None
+                and request.py_request_id in previous_device_sampled_request_ids)
+            if not has_previous_device_draft or request.is_dummy:
                 # get token ids, including input token ids and draft token ids. For these dummy requests,
                 # no need to copy the token ids.
                 if not (request.is_attention_dp_dummy
@@ -3069,6 +3092,9 @@ def previous_seq_slots_device():
 
         if spec_metadata is not None:
             total_draft_lens = sum(draft_lens)
+            spec_sampling_requests = (
+                scheduled_requests.context_requests + extend_requests +
+                first_draft_requests + generation_requests)
             spec_metadata.draft_tokens = self.draft_tokens_cuda[:
                                                                 total_draft_lens]
             spec_metadata.request_ids = request_ids
@@ -3077,13 +3103,23 @@ def previous_seq_slots_device():
                 scheduled_requests.generation_requests)
             spec_metadata.num_tokens = total_num_tokens
             spec_metadata.seq_lens = sequence_lengths
+            spec_metadata.sampling_request_ids = [
+                int(request.py_request_id) for request in spec_sampling_requests
+            ]
+            spec_metadata.sampling_seq_slots = [
+                int(request.py_seq_slot)
+                if request.py_seq_slot is not None else -1
+                for request in spec_sampling_requests
+            ]
             spec_metadata.num_accepted_draft_tokens = self.num_accepted_draft_tokens_cuda[:len(
                 num_accepted_draft_tokens)]
             if isinstance(spec_metadata, Eagle3SpecMetadata):
                 spec_metadata.request_accepted_path = request_accepted_path
             # No-op for non 1-model
             spec_metadata.populate_sampling_params_for_one_model(
-                scheduled_requests.all_requests())
+                spec_sampling_requests)
+            spec_metadata.prepare_device_penalty_counts(
+                int(self.model.config.vocab_size))
             spec_metadata.prepare()
             inputs['spec_metadata'] = spec_metadata
 
@@ -3116,6 +3152,13 @@ def previous_seq_slots_device():
 
         if not self.is_warmup:
             self.previous_request_ids = all_gen_request_ids
+            self.previous_device_sampled_request_ids = {
+                request.py_request_id
+                for request in (
+                    scheduled_requests.context_requests_last_chunk +
+                    scheduled_requests.generation_requests)
+                if not request.is_dummy
+            }
             self.has_previous_device_draft = next_draft_tokens_device is not None
 
         return inputs, self.gather_ids_cuda[:len(
@@ -3871,9 +3914,68 @@ def capture_postprocess_fn(inputs: Dict[str, Any]):
                 self.forward_pass_callable()
 
             self._execute_logit_post_processors(scheduled_requests, outputs)
+            self._attach_spec_penalty_outputs(outputs,
+                                              inputs.get("spec_metadata"))
 
             return outputs
 
+    @staticmethod
+    def _attach_spec_penalty_outputs(outputs: Dict[str, Any],
+                                     spec_metadata: Any) -> None:
+        """Copy penalty state from ``spec_metadata`` into ``outputs`` in place.
+
+        A no-op unless ``outputs`` is a dict and ``spec_metadata`` is not
+        ``None``. Attributes are read via ``getattr`` with fallbacks. Adds:
+          * ``penalty_sampling_request_ids`` / ``penalty_sampling_seq_slots``
+          * if ``use_device_penalty_counts``: ``penalty_count_seq_slots``, plus
+            ``penalty_token_counts`` ("dense" mode) or the ``penalty_sparse_*``
+            tensors and ``penalty_count_vocab_size`` ("sparse", all-or-none)
+          * if ``use_device_penalty_history`` and both history tensors exist:
+            ``penalty_history_tokens``, ``penalty_history_lens`` and, when
+            present, ``penalty_history_seq_slots``
+        """
+        if spec_metadata is None or not isinstance(outputs, dict):
+            return
+
+        def read(name: str, fallback: Any = None) -> Any:
+            return getattr(spec_metadata, name, fallback)
+
+        def publish(key: str, value: Any) -> None:
+            # Absent (``None``) state is skipped rather than stored.
+            if value is not None:
+                outputs[key] = value
+
+        publish("penalty_sampling_request_ids", read("sampling_request_ids"))
+        publish("penalty_sampling_seq_slots", read("sampling_seq_slots"))
+
+        if read("use_device_penalty_counts", False):
+            publish("penalty_count_seq_slots",
+                    read("device_penalty_count_seq_slots"))
+            mode = read("device_penalty_count_mode", "")
+            if mode == "dense":
+                publish("penalty_token_counts",
+                        read("device_penalty_token_counts"))
+            elif mode == "sparse":
+                ids = read("device_penalty_sparse_token_ids")
+                counts = read("device_penalty_sparse_token_counts")
+                lens = read("device_penalty_sparse_count_lens")
+                # Sparse state is only exported as a complete triple.
+                if not (ids is None or counts is None or lens is None):
+                    outputs["penalty_sparse_token_ids"] = ids
+                    outputs["penalty_sparse_token_counts"] = counts
+                    outputs["penalty_sparse_count_lens"] = lens
+                    outputs["penalty_count_vocab_size"] = read(
+                        "device_penalty_count_vocab_size", 0)
+
+        if read("use_device_penalty_history", False):
+            tokens = read("device_penalty_history_tokens")
+            hist_lens = read("device_penalty_history_lens")
+            if tokens is not None and hist_lens is not None:
+                publish("penalty_history_seq_slots",
+                        read("device_penalty_seq_slots"))
+                outputs["penalty_history_tokens"] = tokens
+                outputs["penalty_history_lens"] = hist_lens
+
     def model_forward(self, **kwargs):
         attrs = get_model_extra_attrs()
         assert attrs is not None, "Model extra attrs is not set"
diff --git a/tensorrt_llm/_torch/speculative/interface.py b/tensorrt_llm/_torch/speculative/interface.py
index f4dad9d8f4e..d7094b5c8d2 100644
--- a/tensorrt_llm/_torch/speculative/interface.py
+++ b/tensorrt_llm/_torch/speculative/interface.py
@@ -1,5 +1,6 @@
 import copy
 import os
+from collections import Counter
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, field
@@ -20,6 +21,7 @@
 
 if TYPE_CHECKING:
     from ..pyexecutor.guided_decoder import CapturableGuidedDecoder
+    from ..pyexecutor.llm_request import LlmRequest
 
 if IS_FLASHINFER_AVAILABLE:
     import flashinfer
@@ -328,6 +330,9 @@ class SpecMetadata:
     max_draft_len: int
     # The max number of draft tokens for the static tree and dynamic tree   .
     max_total_draft_tokens: int
+    # Capacity for persistent sequence-slot indexed state. This can be larger
+    # than max_num_requests when the executor has multiple sequence slots.
+    max_num_sequence_slots: Optional[int] = None
     # The number of gen-phase sequences in the batch.
     num_generations: int = 0
     # Whether CUDA graph is enabled.
@@ -384,6 +389,73 @@ class SpecMetadata:
     temperatures: Optional[torch.Tensor] = None
     top_ks: Optional[torch.Tensor] = None
     top_ps: Optional[torch.Tensor] = None
+    recent_penalty_token_ids: Optional[torch.Tensor] = field(default=None,
+                                                             repr=False)
+    recent_penalty_values: Optional[torch.Tensor] = field(default=None,
+                                                          repr=False)
+    recent_seq_penalty_token_ids: Optional[torch.Tensor] = field(default=None,
+                                                                 repr=False)
+    recent_seq_penalty_values: Optional[torch.Tensor] = field(default=None,
+                                                              repr=False)
+    draft_prefix_penalty_token_ids: Optional[torch.Tensor] = field(
+        default=None, repr=False)
+    draft_prefix_penalty_values: Optional[torch.Tensor] = field(default=None,
+                                                                repr=False)
+    draft_prefix_penalty_rows: Optional[torch.Tensor] = field(default=None,
+                                                              repr=False)
+    device_penalty_history_tokens: Optional[torch.Tensor] = field(
+        default=None, repr=False)
+    device_penalty_history_lens: Optional[torch.Tensor] = field(default=None,
+                                                                repr=False)
+    device_penalty_row_slots: Optional[torch.Tensor] = field(default=None,
+                                                             repr=False)
+    device_penalty_seq_slots: Optional[torch.Tensor] = field(default=None,
+                                                             repr=False)
+    device_frequency_penalties: Optional[torch.Tensor] = field(default=None,
+                                                               repr=False)
+    device_seq_frequency_penalties: Optional[torch.Tensor] = field(
+        default=None, repr=False)
+    device_penalty_history_capacity: int = 0
+    use_device_penalty_history: bool = False
+    device_penalty_token_counts: Optional[torch.Tensor] = field(default=None,
+                                                                repr=False)
+    device_penalty_sparse_token_ids: Optional[torch.Tensor] = field(
+        default=None, repr=False)
+    device_penalty_sparse_token_counts: Optional[torch.Tensor] = field(
+        default=None, repr=False)
+    device_penalty_sparse_count_lens: Optional[torch.Tensor] = field(
+        default=None, repr=False)
+    device_penalty_count_row_slots: Optional[torch.Tensor] = field(
+        default=None, repr=False)
+    device_penalty_count_seq_slots: Optional[torch.Tensor] = field(
+        default=None, repr=False)
+    device_count_frequency_penalties: Optional[torch.Tensor] = field(
+        default=None, repr=False)
+    device_count_seq_frequency_penalties: Optional[torch.Tensor] = field(
+        default=None, repr=False)
+    device_penalty_count_reset_slots: Optional[torch.Tensor] = field(
+        default=None, repr=False)
+    device_penalty_count_reset_count: int = 0
+    device_penalty_count_prompt_tokens: Optional[torch.Tensor] = field(
+        default=None, repr=False)
+    device_penalty_count_prompt_token_counts: Optional[torch.Tensor] = field(
+        default=None, repr=False)
+    device_penalty_count_prompt_lens: Optional[torch.Tensor] = field(
+        default=None, repr=False)
+    device_penalty_count_prompt_seq_slots: Optional[torch.Tensor] = field(
+        default=None, repr=False)
+    device_penalty_count_prompt_count: int = 0
+    device_penalty_count_prompt_capacity: int = 0
+    device_penalty_sparse_count_capacity: int = 0
+    device_penalty_count_vocab_size: int = 0
+    device_penalty_count_mode: str = "dense"
+    use_device_penalty_counts: bool = False
+    device_penalty_count_slot_request_ids: dict[int, int] = field(
+        default_factory=dict, repr=False)
+    cuda_graph_source_metadata: Optional[object] = field(default=None,
+                                                         repr=False)
+    sampling_request_ids: Optional[list[int]] = field(default=None, repr=False)
+    sampling_seq_slots: Optional[list[int]] = field(default=None, repr=False)
 
     def __post_init__(self):
         pass
@@ -403,9 +475,30 @@ def create_cuda_graph_metadata(self, max_batch_size: int):
         cuda_graph_metadata = copy.copy(self)
         cuda_graph_metadata.is_cuda_graph = True
         cuda_graph_metadata.max_num_requests = max_batch_size
+        cuda_graph_metadata.cuda_graph_source_metadata = self
+        cuda_graph_metadata.device_penalty_count_slot_request_ids = (
+            self.device_penalty_count_slot_request_ids)
+        cuda_graph_metadata._sync_device_penalty_count_state_from_owner()
         cuda_graph_metadata.__post_init__()
         return cuda_graph_metadata
 
+    def _device_penalty_count_state_owner(self):
+        return self.cuda_graph_source_metadata or self
+
+    def _sync_device_penalty_count_state_from_owner(self) -> None:
+        owner = self._device_penalty_count_state_owner()
+        if owner is self:
+            return
+        for name in (
+                "device_penalty_token_counts",
+                "device_penalty_sparse_token_ids",
+                "device_penalty_sparse_token_counts",
+                "device_penalty_sparse_count_lens",
+                "device_penalty_sparse_count_capacity",
+                "device_penalty_count_vocab_size",
+        ):
+            setattr(self, name, getattr(owner, name))
+
     def is_layer_capture(self, layer_id: int):
         """
         Whether the layer should be captured (eg for Eagle3).
@@ -421,6 +514,850 @@ def maybe_capture_hidden_states(self, layer_id: int,
         model. Use this method to record them. By default, does nothing.
         """
 
+    @staticmethod
+    def _sampling_config_value(config, name: str, default):
+        """Return the scalar stored in ``config.<name>``, or ``default``.
+
+        ``default`` is returned when the attribute is missing, ``None``, an
+        empty tensor or an empty list/tuple, or when the first list/tuple
+        element is ``None``. Tensors and sequences yield their first element.
+        """
+        raw = getattr(config, name, None)
+        if isinstance(raw, torch.Tensor):
+            return raw.flatten()[0].item() if raw.numel() else default
+        if isinstance(raw, (list, tuple)):
+            raw = raw[0] if raw else None
+        return default if raw is None else raw
+
+    @staticmethod
+    def _effective_prompt_ignore_length(request: "LlmRequest",
+                                        prompt_ignore_length: int) -> int:
+        """Clamp ``prompt_ignore_length`` to [0, prompt length of request]."""
+        # First non-None length attribute wins; falls back to 0, including
+        # when ``prompt_len`` is present but None (``int(None)`` would raise).
+        attrs = ("py_orig_prompt_len", "orig_prompt_len", "py_prompt_len",
+                 "prompt_len")
+        lens = (getattr(request, attr, None) for attr in attrs)
+        prompt_len = next((v for v in lens if v is not None), 0)
+        return min(max(prompt_ignore_length, 0), max(int(prompt_len), 0))
+
+    @staticmethod
+    def _prompt_len(request: "LlmRequest") -> int:
+        """Return the request's first non-None prompt length, floored at 0."""
+        names = ("py_orig_prompt_len", "orig_prompt_len", "py_prompt_len",
+                 "prompt_len")
+        found = (getattr(request, name, None) for name in names)
+        first = next((v for v in found if v is not None), None)
+        return 0 if first is None else max(int(first), 0)
+
+    def _valid_seq_slot(self, slot: int) -> bool:
+        return 0 <= slot < self._max_num_sequence_slots()
+
+    def _max_num_sequence_slots(self) -> int:
+        """Slot capacity: the configured value, floored at max_num_requests."""
+        floor = self.max_num_requests
+        slots = self.max_num_sequence_slots or 0
+        return max(int(slots), floor) if slots > 0 else floor
+
+    @staticmethod
+    def _env_bool(value: Optional[str]) -> Optional[bool]:
+        """Parse a boolean-like env string; ``None`` when unset/auto/unknown.
+
+        Matching ignores case and surrounding whitespace.
+        """
+        token = "" if value is None else value.strip().lower()
+        if token in ("1", "true", "yes", "on"):
+            return True
+        if token in ("0", "false", "no", "off"):
+            return False
+        return None
+
+    @staticmethod
+    def _is_disagg_generation_role() -> bool:
+        """True if env vars select a disaggregated generation role."""
+        role = os.environ.get("TRTLLM_DISAGG_ROLE", "").strip().lower()
+        gen_only = os.environ.get("TRTLLM_DISAGG_BENCHMARK_GEN_ONLY") == "1"
+        return gen_only or role in ("generation", "gen", "decode")
+
+    def _force_graph_count_path_enabled(self) -> bool:
+        """CUDA-graph only: env override, else the disagg generation role."""
+        forced = self._env_bool(
+            os.environ.get("TRTLLM_SPEC_FORCE_GRAPH_COUNT_PATH"))
+        chosen = self._is_disagg_generation_role() if forced is None else forced
+        return self.is_cuda_graph and chosen
+
+    def _ensure_recent_penalty_buffers(self, width: int) -> None:
+        """Allocate zeroed recent-penalty CUDA buffers when missing/mis-shaped.
+
+        Row buffers are ``((max_draft_len + 1) * max_num_requests, width)`` and
+        per-sequence buffers are ``(max_num_requests, width)``; token ids are
+        int64 and values float32. Existing buffers are kept when all four are
+        set and both token-id buffers already have the expected shape.
+        """
+        row_shape = ((self.max_draft_len + 1) * self.max_num_requests, width)
+        seq_shape = (self.max_num_requests, width)
+        current = (self.recent_penalty_token_ids, self.recent_penalty_values,
+                   self.recent_seq_penalty_token_ids,
+                   self.recent_seq_penalty_values)
+        if (all(buf is not None for buf in current)
+                and self.recent_penalty_token_ids.shape == row_shape
+                and self.recent_seq_penalty_token_ids.shape == seq_shape):
+            return
+
+        def zeros(shape, dtype):
+            return torch.zeros(shape, dtype=dtype, device="cuda")
+
+        self.recent_penalty_token_ids = zeros(row_shape, torch.long)
+        self.recent_penalty_values = zeros(row_shape, torch.float32)
+        self.recent_seq_penalty_token_ids = zeros(seq_shape, torch.long)
+        self.recent_seq_penalty_values = zeros(seq_shape, torch.float32)
+
+    def _ensure_draft_prefix_penalty_buffers(self, width: int) -> None:
+        """Allocate draft-prefix penalty buffers on CUDA if missing or resized.
+
+        Kept as-is only when all three exist and the token-id buffer matches.
+        """
+        num_rows = (self.max_draft_len + 1) * self.max_num_requests
+        shape = (num_rows, width)
+        if (self.draft_prefix_penalty_token_ids is not None
+                and self.draft_prefix_penalty_values is not None
+                and self.draft_prefix_penalty_rows is not None
+                and self.draft_prefix_penalty_token_ids.shape == shape):
+            return
+
+        self.draft_prefix_penalty_token_ids = torch.zeros(
+            shape, dtype=torch.long, device="cuda")
+        self.draft_prefix_penalty_values = torch.zeros(
+            shape, dtype=torch.float32, device="cuda")
+        self.draft_prefix_penalty_rows = torch.arange(
+            num_rows, dtype=torch.long, device="cuda")
+
+    def _ensure_device_penalty_history_buffers(self) -> None:
+        """Allocate the zeroed CUDA buffers backing the device penalty history.
+
+        Capacity per slot is TRTLLM_SPEC_PENALTY_HISTORY_TOKENS (default 16384,
+        negatives clamped to 0); 0 clears ``use_device_penalty_history`` and
+        allocates nothing. Buffers are reused when all six exist and the
+        history-token and row-slot shapes already match.
+        """
+        num_rows = (self.max_draft_len + 1) * self.max_num_requests
+        num_slots = self._max_num_sequence_slots()
+        raw = os.environ.get("TRTLLM_SPEC_PENALTY_HISTORY_TOKENS", "16384")
+        capacity = max(int(raw), 0)
+        if capacity == 0:
+            self.use_device_penalty_history = False
+            return
+
+        existing = (self.device_penalty_history_tokens,
+                    self.device_penalty_history_lens,
+                    self.device_penalty_row_slots,
+                    self.device_penalty_seq_slots,
+                    self.device_frequency_penalties,
+                    self.device_seq_frequency_penalties)
+        history_shape = (num_slots, capacity)
+        if (all(buf is not None for buf in existing)
+                and self.device_penalty_history_tokens.shape == history_shape
+                and self.device_penalty_row_slots.shape == (num_rows, )):
+            return
+
+        def zeros(shape, dtype):
+            return torch.zeros(shape, dtype=dtype, device="cuda")
+
+        n_req = self.max_num_requests
+        self.device_penalty_history_capacity = capacity
+        self.device_penalty_history_tokens = zeros(history_shape, torch.int32)
+        self.device_penalty_history_lens = zeros((num_slots, ), torch.int32)
+        self.device_penalty_row_slots = zeros((num_rows, ), torch.int32)
+        self.device_penalty_seq_slots = zeros((n_req, ), torch.int32)
+        self.device_frequency_penalties = zeros((num_rows, ), torch.float32)
+        self.device_seq_frequency_penalties = zeros((n_req, ), torch.float32)
+
+    def _ensure_device_penalty_count_metadata_buffers(self) -> None:
+        """Allocate zeroed CUDA metadata buffers for count-based penalties.
+
+        No-op when all five exist and the row-slot buffer has the expected length.
+        """
+        num_rows = (self.max_draft_len + 1) * self.max_num_requests
+        n_req = self.max_num_requests
+        existing = (self.device_penalty_count_row_slots,
+                    self.device_penalty_count_seq_slots,
+                    self.device_count_frequency_penalties,
+                    self.device_count_seq_frequency_penalties,
+                    self.device_penalty_count_reset_slots)
+        if (all(buf is not None for buf in existing)
+                and self.device_penalty_count_row_slots.shape == (num_rows, )):
+            return
+
+        def zeros(length, dtype):
+            return torch.zeros((length, ), dtype=dtype, device="cuda")
+
+        self.device_penalty_count_row_slots = zeros(num_rows, torch.int32)
+        self.device_penalty_count_seq_slots = zeros(n_req, torch.int32)
+        self.device_count_frequency_penalties = zeros(num_rows, torch.float32)
+        self.device_count_seq_frequency_penalties = zeros(n_req, torch.float32)
+        self.device_penalty_count_reset_slots = zeros(n_req, torch.int64)
+
+    def ensure_device_penalty_count_buffers(self, vocab_size: int) -> None:
+        """Make sure per-slot token-count storage exists for ``vocab_size``.
+
+        A non-positive ``vocab_size`` turns ``use_device_penalty_counts`` off. If
+        another object owns the count state, it receives the mode/flag, allocates,
+        and the state is synced back. "dense" mode keeps one int32 (slots, vocab)
+        table; otherwise int32 (slots, capacity) id/count tables plus per-slot
+        lengths, with capacity from TRTLLM_SPEC_SPARSE_COUNT_CAPACITY (<= vocab).
+        """
+        if vocab_size <= 0:
+            self.use_device_penalty_counts = False
+            return
+        owner = self._device_penalty_count_state_owner()
+        if owner is not self:
+            owner.device_penalty_count_mode = self.device_penalty_count_mode
+            owner.use_device_penalty_counts = self.use_device_penalty_counts
+            owner.ensure_device_penalty_count_buffers(vocab_size)
+            self._sync_device_penalty_count_state_from_owner()
+            return
+
+        num_slots = self._max_num_sequence_slots()
+
+        def zeros(*shape):
+            return torch.zeros(shape, dtype=torch.int32, device="cuda")
+
+        if self.device_penalty_count_mode == "dense":
+            dense = self.device_penalty_token_counts
+            if (dense is None
+                    or self.device_penalty_count_vocab_size != vocab_size
+                    or dense.shape != (num_slots, vocab_size)):
+                self.device_penalty_count_vocab_size = vocab_size
+                self.device_penalty_token_counts = zeros(num_slots, vocab_size)
+            return
+
+        raw = os.environ.get("TRTLLM_SPEC_SPARSE_COUNT_CAPACITY", "").strip()
+        capacity = int(raw) if raw else 0
+        capacity = min(capacity, vocab_size) if capacity > 0 else vocab_size
+        ids = self.device_penalty_sparse_token_ids
+        if (ids is not None
+                and self.device_penalty_sparse_token_counts is not None
+                and self.device_penalty_sparse_count_lens is not None
+                and self.device_penalty_count_vocab_size == vocab_size
+                and self.device_penalty_sparse_count_capacity == capacity
+                and ids.shape == (num_slots, capacity)):
+            return
+
+        self.device_penalty_count_vocab_size = vocab_size
+        self.device_penalty_sparse_count_capacity = capacity
+        self.device_penalty_sparse_token_ids = zeros(num_slots, capacity)
+        self.device_penalty_sparse_token_counts = zeros(num_slots, capacity)
+        self.device_penalty_sparse_count_lens = zeros(num_slots)
+
+    def reset_device_penalty_count_slots(self) -> None:
+        """Zero the device-side counts of every slot queued for reset.
+
+        Out-of-range slot ids are ignored. The pending count is cleared once
+        handled, but kept if the mode's table has not been allocated yet.
+        """
+        pending = self.device_penalty_count_reset_count
+        queue = self.device_penalty_count_reset_slots
+        if not self.use_device_penalty_counts or queue is None or pending == 0:
+            return
+        slots = queue[:pending]
+        in_range = (slots >= 0) & (slots < self._max_num_sequence_slots())
+        slots = slots[in_range]
+        if slots.numel() > 0:
+            if self.device_penalty_count_mode == "dense":
+                table = self.device_penalty_token_counts
+            else:
+                table = self.device_penalty_sparse_count_lens
+            if table is None:
+                return
+            table.index_fill_(0, slots, 0)
+        self.device_penalty_count_reset_count = 0
+
+    def init_device_penalty_count_prompt_tokens(self) -> None:
+        """Seed the device count tables from the staged prompt-token rows.
+
+        No-op unless counts are enabled and rows are staged; the staged row
+        count is cleared afterwards unless a target table is missing.
+        """
+        n = self.device_penalty_count_prompt_count
+        tokens = self.device_penalty_count_prompt_tokens
+        lens = self.device_penalty_count_prompt_lens
+        slots = self.device_penalty_count_prompt_seq_slots
+        if (not self.use_device_penalty_counts or tokens is None
+                or lens is None or slots is None or n == 0):
+            return
+
+        if self.device_penalty_count_mode == "dense":
+            if self.device_penalty_token_counts is None:
+                return
+            from .one_model_sampler import append_accepted_tokens_to_counts
+            append_accepted_tokens_to_counts(
+                self.device_penalty_token_counts, slots[:n],
+                tokens[:n].contiguous(), lens[:n].contiguous())
+        else:
+            counts = self.device_penalty_count_prompt_token_counts
+            if (self.device_penalty_sparse_token_ids is None
+                    or self.device_penalty_sparse_token_counts is None
+                    or self.device_penalty_sparse_count_lens is None
+                    or counts is None):
+                return
+            cols = self.device_penalty_count_prompt_capacity
+            from .one_model_sampler import init_sparse_token_counts
+            init_sparse_token_counts(
+                self.device_penalty_sparse_token_ids,
+                self.device_penalty_sparse_token_counts,
+                self.device_penalty_sparse_count_lens,
+                tokens[:n, :cols].contiguous(), counts[:n, :cols].contiguous(),
+                lens[:n].contiguous(), slots[:n].contiguous(),
+                self.device_penalty_count_vocab_size)
+        self.device_penalty_count_prompt_count = 0
+
+    def prepare_device_penalty_counts(self, vocab_size: int) -> None:
+        """When counts are enabled: ensure buffers, reset slots, seed prompts."""
+        if self.use_device_penalty_counts:
+            self.ensure_device_penalty_count_buffers(vocab_size)
+            self.reset_device_penalty_count_slots()
+            self.init_device_penalty_count_prompt_tokens()
+
+    def _ensure_device_penalty_count_prompt_buffers(
+            self, max_prompt_tokens: int) -> None:
+        """(Re)allocate zeroed int32 CUDA prompt staging buffers when too small.
+
+        No-op for a non-positive size or when existing capacity is sufficient.
+        """
+        if max_prompt_tokens <= 0:
+            return
+        staging = (self.device_penalty_count_prompt_tokens,
+                   self.device_penalty_count_prompt_token_counts,
+                   self.device_penalty_count_prompt_lens,
+                   self.device_penalty_count_prompt_seq_slots)
+        if (all(buf is not None for buf in staging) and
+                self.device_penalty_count_prompt_capacity >= max_prompt_tokens):
+            return
+
+        n_req, cols = self.max_num_requests, max_prompt_tokens
+        def zeros(*shape):
+            return torch.zeros(shape, dtype=torch.int32, device="cuda")
+        self.device_penalty_count_prompt_capacity = cols
+        self.device_penalty_count_prompt_tokens = zeros(n_req, cols)
+        self.device_penalty_count_prompt_token_counts = zeros(n_req, cols)
+        self.device_penalty_count_prompt_lens = zeros(n_req)
+        self.device_penalty_count_prompt_seq_slots = zeros(n_req)
+
+    def _populate_device_count_frequency_penalties(
+            self, requests: list["LlmRequest"]) -> bool:
+        """Stage per-request frequency-penalty metadata for device-side counts.
+
+        Builds the slot / penalty rows for ``requests`` on the host and copies
+        them into the device metadata buffers with non-blocking copies. Requests
+        that newly occupy a sequence slot are queued for a count reset and, when
+        they carry a frequency penalty, have their tokens (past the effective
+        prompt-ignore length) staged for seeding the count table.
+
+        The path is disabled (``use_device_penalty_counts = False``) when
+        TRTLLM_SPEC_USE_DEVICE_COUNTS is not "1", advanced sampling is off,
+        ``spec_dec_mode.use_one_engine()`` is false, a request has a presence
+        penalty, no rows were produced, or no request has a frequency penalty
+        and the graph count path is not forced.
+
+        Returns True when the device count path was enabled for this batch.
+        """
+        debug_rows: list[dict[str, object]] = []
+
+        def debug_return(reason: str, **extra: object) -> None:
+            if int(os.environ.get("TRTLLM_SPEC_COUNT_DEBUG_TOKENS", "0")) <= 0:
+                return
+            if os.environ.get("TRTLLM_SPEC_COUNT_DEBUG_SKIP_ZERO", "1") == "1":
+                has_penalty_row = any(
+                    float(row.get("frequency_penalty", 0.0)) != 0.0
+                    or float(row.get("presence_penalty", 0.0)) != 0.0
+                    for row in debug_rows)
+                if not has_penalty_row and reason != "enabled":
+                    return
+            max_logs = max(
+                int(os.environ.get("TRTLLM_SPEC_COUNT_DEBUG_POPULATE_LOGS",
+                                   "8")), 0)
+            logs = int(getattr(self, "_device_count_populate_debug_logs", 0))
+            if logs >= max_logs:
+                return
+            setattr(self, "_device_count_populate_debug_logs", logs + 1)
+            extra.setdefault("max_num_requests", self.max_num_requests)
+            extra.setdefault("slot_capacity", self._max_num_sequence_slots())
+            logger.info(
+                "Spec count populate debug reason=%s env_use_counts=%s "
+                "env_mode=%s allow_advanced=%s spec_mode=%s use_one_engine=%s "
+                "request_count=%s rows=%s extra=%s",
+                reason,
+                os.environ.get("TRTLLM_SPEC_USE_DEVICE_COUNTS", "0"),
+                os.environ.get("TRTLLM_SPEC_COUNT_MODE", ""),
+                self.allow_advanced_sampling,
+                self.spec_dec_mode,
+                self.spec_dec_mode.use_one_engine(),
+                len(requests),
+                debug_rows,
+                extra)
+
+        if os.environ.get("TRTLLM_SPEC_USE_DEVICE_COUNTS", "0") != "1":
+            self.use_device_penalty_counts = False
+            debug_return("env_disabled")
+            return False
+        if not self.allow_advanced_sampling or not self.spec_dec_mode.use_one_engine(
+        ):
+            self.use_device_penalty_counts = False
+            debug_return("not_advanced_or_not_one_engine")
+            return False
+
+        row_slots: list[int] = []
+        seq_slots: list[int] = []
+        frequency_penalties: list[float] = []
+        seq_frequency_penalties: list[float] = []
+        reset_slots: list[int] = []
+        prompt_init_slots: list[int] = []
+        prompt_init_tokens: list[list[int]] = []
+        prompt_init_token_counts: list[list[int]] = []
+        any_penalty = False
+        can_use = True
+        count_mode = os.environ.get("TRTLLM_SPEC_COUNT_MODE",
+                                    "sparse").strip().lower()
+        if count_mode not in ("dense", "sparse"):
+            count_mode = "sparse"
+        force_graph_count_path = self._force_graph_count_path_enabled()
+        sparse_capacity_limit = int(
+            os.environ.get("TRTLLM_SPEC_SPARSE_COUNT_CAPACITY", "").strip() or "0")
+
+        next_slot_request_ids: dict[int, int] = {}
+        from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequestState
+
+        for request in requests:
+            raw_slot = getattr(request, "py_seq_slot", None)
+            slot = int(raw_slot) if raw_slot is not None else -1
+            valid_slot = (self._valid_seq_slot(slot)
+                          and not getattr(request, "is_dummy", False))
+            effective_slot = slot if valid_slot else -1
+            seq_slots.append(effective_slot)
+            request_id = int(getattr(request, "py_request_id",
+                                     getattr(request, "request_id", -1)))
+            if valid_slot:
+                next_slot_request_ids[slot] = request_id
+
+            sampling_config = request.sampling_config
+            frequency_penalty = float(
+                self._sampling_config_value(sampling_config,
+                                            "frequency_penalty", 0.0))
+            presence_penalty = float(
+                self._sampling_config_value(sampling_config,
+                                            "presence_penalty", 0.0))
+            prompt_ignore_length = int(
+                self._sampling_config_value(sampling_config,
+                                            "prompt_ignore_length", 0))
+            raw_prompt_len = self._prompt_len(request)
+            if len(debug_rows) < 8:
+                state = getattr(request, "state", None)
+                debug_rows.append({
+                    "request_id": request_id,
+                    "slot": slot,
+                    "valid_slot": valid_slot,
+                    "state": getattr(state, "name", str(state)),
+                    "frequency_penalty": frequency_penalty,
+                    "presence_penalty": presence_penalty,
+                    "prompt_ignore_length": prompt_ignore_length,
+                    "raw_prompt_len": raw_prompt_len,
+                })
+
+            if presence_penalty != 0.0:
+                can_use = False
+                break
+
+            if not valid_slot:
+                frequency_penalty = 0.0
+
+            any_penalty = any_penalty or frequency_penalty != 0.0
+            is_new_slot_request = valid_slot and (
+                self.device_penalty_count_slot_request_ids.get(slot) != request_id)
+            if is_new_slot_request:
+                reset_slots.append(slot)
+                if frequency_penalty != 0.0:
+                    ignore_length = self._effective_prompt_ignore_length(
+                        request, prompt_ignore_length)
+                    tokens = request.get_tokens(0)
+                    count_history = [
+                        int(token) for token in tokens[ignore_length:]
+                        if int(token) >= 0
+                    ]
+                    if count_history:
+                        if count_mode == "sparse":
+                            counts = Counter(count_history)
+                            unique_tokens = list(counts.keys())
+                            # A history with more unique tokens than the sparse
+                            # table can hold is not seeded; the request starts
+                            # with an empty count table and accepted tokens are
+                            # appended later by the sampler. Its rows below
+                            # must still be emitted (no `continue` here), or
+                            # every later request's rows would be misaligned.
+                            if (sparse_capacity_limit <= 0
+                                    or len(unique_tokens)
+                                    <= sparse_capacity_limit):
+                                prompt_init_slots.append(slot)
+                                prompt_init_tokens.append(unique_tokens)
+                                prompt_init_token_counts.append([
+                                    int(counts[token])
+                                    for token in unique_tokens
+                                ])
+                        else:
+                            prompt_init_slots.append(slot)
+                            prompt_init_tokens.append(count_history)
+                            prompt_init_token_counts.append([])
+
+            num_rows = 1 + self.runtime_draft_len if request.state == LlmRequestState.GENERATION_IN_PROGRESS else 1
+            row_slots.extend(effective_slot for _ in range(num_rows))
+            frequency_penalties.extend(frequency_penalty for _ in range(num_rows))
+            seq_frequency_penalties.append(frequency_penalty)
+
+        if not can_use or not row_slots or (not any_penalty
+                                            and not force_graph_count_path):
+            self.use_device_penalty_counts = False
+            debug_return("not_can_use_or_no_rows_or_no_penalty",
+                         can_use=can_use,
+                         row_count=len(row_slots),
+                         any_penalty=any_penalty,
+                         force_graph_count_path=force_graph_count_path)
+            return False
+
+        self.device_penalty_count_slot_request_ids.update(
+            next_slot_request_ids)
+
+        self.device_penalty_count_mode = count_mode
+        self._ensure_device_penalty_count_metadata_buffers()
+        assert self.device_penalty_count_row_slots is not None
+        assert self.device_penalty_count_seq_slots is not None
+        assert self.device_count_frequency_penalties is not None
+        assert self.device_count_seq_frequency_penalties is not None
+        assert self.device_penalty_count_reset_slots is not None
+        max_prompt_tokens = max((len(tokens) for tokens in prompt_init_tokens),
+                                default=0)
+        self._ensure_device_penalty_count_prompt_buffers(max_prompt_tokens)
+
+        def host_tensor(values, dtype):
+            # Host-side source tensor for the non-blocking copies below.
+            return torch.tensor(values, dtype=dtype, pin_memory=prefer_pinned())
+
+        self.device_penalty_count_seq_slots[:len(seq_slots)].copy_(
+            host_tensor(seq_slots, torch.int32), non_blocking=True)
+        self.device_penalty_count_row_slots[:len(row_slots)].copy_(
+            host_tensor(row_slots, torch.int32), non_blocking=True)
+        self.device_count_frequency_penalties[:len(frequency_penalties)].copy_(
+            host_tensor(frequency_penalties, torch.float32), non_blocking=True)
+        self.device_count_seq_frequency_penalties[:len(
+            seq_frequency_penalties)].copy_(
+                host_tensor(seq_frequency_penalties, torch.float32),
+                non_blocking=True)
+        self.device_penalty_count_reset_count = len(reset_slots)
+        if reset_slots:
+            self.device_penalty_count_reset_slots[:len(reset_slots)].copy_(
+                host_tensor(reset_slots, torch.int64), non_blocking=True)
+        self.device_penalty_count_prompt_count = len(prompt_init_tokens)
+        if prompt_init_tokens:
+            assert self.device_penalty_count_prompt_tokens is not None
+            assert self.device_penalty_count_prompt_token_counts is not None
+            assert self.device_penalty_count_prompt_lens is not None
+            assert self.device_penalty_count_prompt_seq_slots is not None
+            prompt_tensor = torch.zeros(
+                (len(prompt_init_tokens), max_prompt_tokens),
+                dtype=torch.int32,
+                pin_memory=prefer_pinned())
+            prompt_counts_tensor = torch.zeros(
+                (len(prompt_init_tokens), max_prompt_tokens),
+                dtype=torch.int32,
+                pin_memory=prefer_pinned())
+            prompt_lens = []
+            for row, tokens in enumerate(prompt_init_tokens):
+                prompt_lens.append(len(tokens))
+                prompt_tensor[row, :len(tokens)] = torch.tensor(
+                    tokens, dtype=torch.int32)
+                if count_mode == "sparse":
+                    prompt_counts_tensor[row, :len(tokens)] = torch.tensor(
+                        prompt_init_token_counts[row], dtype=torch.int32)
+            self.device_penalty_count_prompt_tokens[:len(prompt_init_tokens),
+                                                    :max_prompt_tokens].copy_(
+                                                        prompt_tensor,
+                                                        non_blocking=True)
+            if count_mode == "sparse":
+                self.device_penalty_count_prompt_token_counts[:len(
+                    prompt_init_tokens), :max_prompt_tokens].copy_(
+                        prompt_counts_tensor, non_blocking=True)
+            self.device_penalty_count_prompt_lens[:len(prompt_lens)].copy_(
+                host_tensor(prompt_lens, torch.int32), non_blocking=True)
+            self.device_penalty_count_prompt_seq_slots[:len(
+                prompt_init_slots)].copy_(
+                    host_tensor(prompt_init_slots, torch.int32), non_blocking=True)
+        self.use_device_penalty_counts = True
+        debug_return("enabled",
+                     row_count=len(row_slots),
+                     any_penalty=any_penalty,
+                     force_graph_count_path=force_graph_count_path,
+                     count_mode=count_mode,
+                     seq_slots_head=seq_slots[:32],
+                     row_slots_head=row_slots[:64],
+                     reset_slots=reset_slots[:32],
+                     prompt_init_slots=prompt_init_slots[:32],
+                     prompt_init_count=len(prompt_init_tokens))
+        return True
+
+    def _populate_device_history_frequency_penalties(
+            self, requests: list["LlmRequest"]) -> bool:
+        """Stage slot and frequency-penalty rows for the device-history path.
+
+        The path needs TRTLLM_SPEC_USE_DEVICE_HISTORY == "1", advanced sampling
+        and ``spec_dec_mode.use_one_engine()``. It is ruled out for the whole
+        batch when any request has a presence penalty or a
+        ``prompt_ignore_length`` below its prompt length. Valid slots whose
+        token count does not exceed the prompt length get their length reset.
+
+        Returns:
+            True when the rows were copied and the path was enabled; False
+            otherwise (``use_device_penalty_history`` is cleared on the bail-out
+            paths taken inside this method).
+        """
+        if os.environ.get("TRTLLM_SPEC_USE_DEVICE_HISTORY", "0") != "1":
+            self.use_device_penalty_history = False
+            return False
+        if not (self.allow_advanced_sampling
+                and self.spec_dec_mode.use_one_engine()):
+            self.use_device_penalty_history = False
+            return False
+
+        per_row_slots: list[int] = []
+        per_seq_slots: list[int] = []
+        per_row_penalties: list[float] = []
+        per_seq_penalties: list[float] = []
+        slots_to_clear: list[int] = []
+        mode = os.environ.get("TRTLLM_SPEC_PENALTY_ROW_MODE", "all")
+        # Any value other than "root" is treated as "all".
+        penalize_draft_rows = mode.strip().lower() != "root"
+
+        for request in requests:
+            raw_slot = getattr(request, "py_seq_slot", None)
+            slot = -1 if raw_slot is None else int(raw_slot)
+            valid_slot = (self._valid_seq_slot(slot)
+                          and not getattr(request, "is_dummy", False))
+            effective_slot = slot if valid_slot else -1
+            per_seq_slots.append(effective_slot)
+
+            config = request.sampling_config
+            read = self._sampling_config_value
+            frequency_penalty = float(read(config, "frequency_penalty", 0.0))
+            presence_penalty = float(read(config, "presence_penalty", 0.0))
+            prompt_ignore_length = int(read(config, "prompt_ignore_length", 0))
+            raw_prompt_len = self._prompt_len(request)
+
+            # Only frequency penalties over generated tokens are handled here;
+            # any other token-history semantics disables the path for the batch.
+            if presence_penalty != 0.0 or prompt_ignore_length < raw_prompt_len:
+                self.use_device_penalty_history = False
+                return False
+
+            if not valid_slot:
+                frequency_penalty = 0.0
+            per_seq_penalties.append(frequency_penalty)
+
+            generated_len = max(request.get_num_tokens(0) - raw_prompt_len, 0)
+            if valid_slot and generated_len == 0:
+                slots_to_clear.append(slot)
+
+            from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequestState
+            in_generation = (
+                request.state == LlmRequestState.GENERATION_IN_PROGRESS)
+            draft_rows = self.runtime_draft_len if in_generation else 0
+            per_row_slots.append(effective_slot)
+            per_row_penalties.append(frequency_penalty)
+            if draft_rows > 0:
+                # "root" mode gives the extra rows slot -1 and a zero penalty.
+                draft_slot = effective_slot if penalize_draft_rows else -1
+                draft_penalty = frequency_penalty if penalize_draft_rows else 0.0
+                per_row_slots.extend([draft_slot] * draft_rows)
+                per_row_penalties.extend([draft_penalty] * draft_rows)
+
+        if not per_row_slots:
+            self.use_device_penalty_history = False
+            return False
+
+        self._ensure_device_penalty_history_buffers()
+        if (not self.use_device_penalty_history
+                and self.device_penalty_history_tokens is None):
+            return False
+        assert self.device_penalty_history_tokens is not None
+        assert self.device_penalty_history_lens is not None
+        assert self.device_penalty_row_slots is not None
+        assert self.device_penalty_seq_slots is not None
+        assert self.device_frequency_penalties is not None
+        assert self.device_seq_frequency_penalties is not None
+
+        if slots_to_clear:
+            clear_index = torch.tensor(slots_to_clear,
+                                       dtype=torch.int64,
+                                       pin_memory=prefer_pinned())
+            clear_index = clear_index.to("cuda", non_blocking=True)
+            self.device_penalty_history_lens.index_fill_(0, clear_index, 0)
+
+        uploads = (
+            (self.device_penalty_seq_slots, per_seq_slots, torch.int32),
+            (self.device_penalty_row_slots, per_row_slots, torch.int32),
+            (self.device_frequency_penalties, per_row_penalties,
+             torch.float32),
+            (self.device_seq_frequency_penalties, per_seq_penalties,
+             torch.float32),
+        )
+        for device_buf, host_values, dtype in uploads:
+            host = torch.tensor(host_values, dtype=dtype,
+                                pin_memory=prefer_pinned())
+            device_buf[:len(host_values)].copy_(host, non_blocking=True)
+        self.use_device_penalty_history = True
+        return True
+
+    def append_accepted_tokens_to_penalty_history(
+            self, accepted_tokens: torch.Tensor,
+            num_accepted_tokens: torch.Tensor, batch_size: int) -> None:
+        """Record the first ``batch_size`` rows of accepted tokens on device.
+
+        Uses the count tables when that path is active and allocated for the
+        current mode; otherwise the history buffers, if active and allocated.
+        """
+        def batch(tensor):
+            return tensor[:batch_size].contiguous()
+
+        if (self.use_device_penalty_counts
+                and self.device_penalty_count_seq_slots is not None):
+            mode = self.device_penalty_count_mode
+            if mode == "dense" and self.device_penalty_token_counts is not None:
+                from .one_model_sampler import append_accepted_tokens_to_counts
+                append_accepted_tokens_to_counts(
+                    self.device_penalty_token_counts,
+                    self.device_penalty_count_seq_slots[:batch_size],
+                    batch(accepted_tokens), batch(num_accepted_tokens))
+                return
+            sparse_tables = (self.device_penalty_sparse_token_ids,
+                             self.device_penalty_sparse_token_counts,
+                             self.device_penalty_sparse_count_lens)
+            if mode == "sparse" and all(t is not None for t in sparse_tables):
+                from .one_model_sampler import append_accepted_tokens_to_sparse_counts
+                append_accepted_tokens_to_sparse_counts(
+                    *sparse_tables,
+                    self.device_penalty_count_seq_slots[:batch_size],
+                    batch(accepted_tokens), batch(num_accepted_tokens),
+                    self.device_penalty_count_vocab_size)
+                return
+
+        if (not self.use_device_penalty_history
+                or self.device_penalty_history_tokens is None):
+            return
+        assert self.device_penalty_history_lens is not None
+        assert self.device_penalty_seq_slots is not None
+        from .one_model_sampler import append_accepted_tokens_to_history
+        append_accepted_tokens_to_history(
+            self.device_penalty_history_tokens, self.device_penalty_history_lens,
+            self.device_penalty_seq_slots[:batch_size], batch(accepted_tokens),
+            batch(num_accepted_tokens))
+
+    def _populate_recent_token_penalties_for_one_model(
+            self, requests: list["LlmRequest"]) -> None:
+        """Stage recent-token presence/frequency penalties in the device buffers.
+
+        Per request, the trailing TRTLLM_SPEC_RECENT_PENALTY_TOKENS tokens
+        (never starting before the effective prompt-ignore length) are counted
+        and each distinct non-negative token is assigned
+        ``presence_penalty + frequency_penalty * count``, padded to the window
+        width with token 0 / penalty 0.0. The lists are written once per
+        request (``recent_seq_*``) and once per logits row (``recent_*``).
+        Nothing is written when the feature is disabled or ``requests`` is
+        empty; if no request has a penalty only the value buffers are zeroed.
+        """
+        if not (self.allow_advanced_sampling
+                and self.spec_dec_mode.use_one_engine()):
+            return
+
+        width = max(
+            int(os.environ.get("TRTLLM_SPEC_RECENT_PENALTY_TOKENS", "0")), 0)
+        if width == 0:
+            return
+
+        # Loop-invariant: resolve the import once rather than per request.
+        from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequestState
+
+        row_token_ids: list[list[int]] = []
+        row_penalty_values: list[list[float]] = []
+        seq_token_ids: list[list[int]] = []
+        seq_penalty_values: list[list[float]] = []
+        any_penalty = False
+
+        for request in requests:
+            sampling_config = request.sampling_config
+            frequency_penalty = float(
+                self._sampling_config_value(sampling_config,
+                                            "frequency_penalty", 0.0))
+            presence_penalty = float(
+                self._sampling_config_value(sampling_config,
+                                            "presence_penalty", 0.0))
+            prompt_ignore_length = int(
+                self._sampling_config_value(sampling_config,
+                                            "prompt_ignore_length", 0))
+            in_generation = (
+                request.state == LlmRequestState.GENERATION_IN_PROGRESS)
+            num_rows = 1 + self.runtime_draft_len if in_generation else 1
+
+            ids: list[int] = []
+            penalties: list[float] = []
+            if frequency_penalty != 0.0 or presence_penalty != 0.0:
+                prompt_ignore_length = self._effective_prompt_ignore_length(
+                    request, prompt_ignore_length)
+                tokens = request.get_tokens(0)
+                recent_start = max(prompt_ignore_length, len(tokens) - width)
+                counts: dict[int, int] = {}
+                for token in tokens[recent_start:]:
+                    if token >= 0:  # negative ids are skipped
+                        counts[token] = counts.get(token, 0) + 1
+                # dicts keep insertion order: first occurrence comes first.
+                for token, count in list(counts.items())[:width]:
+                    ids.append(token)
+                    penalties.append(presence_penalty +
+                                     frequency_penalty * count)
+                if penalties:
+                    any_penalty = True
+            # Pad both lists to the fixed window width (token 0, penalty 0.0).
+            pad = width - len(ids)
+            ids.extend([0] * pad)
+            penalties.extend([0.0] * pad)
+
+            for _ in range(num_rows):
+                row_token_ids.append(ids)
+                row_penalty_values.append(penalties)
+            seq_token_ids.append(ids)
+            seq_penalty_values.append(penalties)
+
+        if not row_token_ids:
+            return
+
+        self._ensure_recent_penalty_buffers(width)
+        assert self.recent_penalty_token_ids is not None
+        assert self.recent_penalty_values is not None
+        assert self.recent_seq_penalty_token_ids is not None
+        assert self.recent_seq_penalty_values is not None
+
+        num_rows = len(row_token_ids)
+        num_seqs = len(seq_token_ids)
+        if not any_penalty:
+            self.recent_penalty_values[:num_rows].zero_()
+            self.recent_seq_penalty_values[:num_seqs].zero_()
+            return
+
+        def stage(buffer: torch.Tensor, rows: list, dtype: torch.dtype) -> None:
+            # Host tensor (pinned when preferred) -> async copy into the prefix.
+            host = torch.tensor(rows, dtype=dtype, pin_memory=prefer_pinned())
+            buffer[:len(rows)].copy_(host, non_blocking=True)
+
+        stage(self.recent_penalty_token_ids, row_token_ids, torch.long)
+        stage(self.recent_penalty_values, row_penalty_values, torch.float32)
+        stage(self.recent_seq_penalty_token_ids, seq_token_ids, torch.long)
+        stage(self.recent_seq_penalty_values, seq_penalty_values, torch.float32)
+
     def populate_sampling_params_for_one_model(
             self, requests: list["LlmRequest"]) -> None:
         """
@@ -498,6 +1435,10 @@ def populate_sampling_params_for_one_model(
         self.top_ps[:len(top_ps)].copy_(torch.tensor(
             top_ps, dtype=torch.float32, pin_memory=prefer_pinned()),
                                         non_blocking=True)
+        if self._populate_device_count_frequency_penalties(requests):
+            return
+        if not self._populate_device_history_frequency_penalties(requests):
+            self._populate_recent_token_penalties_for_one_model(requests)
 
 
 class SpecWorkerBase(nn.Module, ABC):
@@ -547,13 +1488,14 @@ def skip_forward(
         next_new_tokens = torch.empty((batch_size, (self.max_draft_len + 1)),
                                       dtype=torch.int,
                                       device=logits.device)
-        return {
+        outputs = {
             'logits': logits,
             'new_tokens': accepted_tokens,
             'new_tokens_lens': num_accepted_tokens,
             'next_draft_tokens': next_draft_tokens,
             'next_new_tokens': next_new_tokens
         }
+        return self._add_penalty_history_outputs(outputs, spec_metadata)
 
     def skip_drafting(
         self,
@@ -585,7 +1527,6 @@ def skip_drafting(
         num_accepted_tokens = torch.ones(batch_size,
                                          dtype=torch.int,
                                          device=logits.device)
-
         next_draft_tokens = torch.zeros((batch_size, 0),
                                         dtype=torch.int,
                                         device=logits.device)
@@ -595,13 +1536,14 @@ def skip_drafting(
                                       device=logits.device)
         next_new_tokens[:, 0] = target_tokens
 
-        return {
+        outputs = {
             'logits': logits,
             'new_tokens': accepted_tokens,
             'new_tokens_lens': num_accepted_tokens,
             'next_draft_tokens': next_draft_tokens,
             'next_new_tokens': next_new_tokens
         }
+        return self._add_penalty_history_outputs(outputs, spec_metadata)
 
     def set_guided_decoder(self,
                            guided_decoder: "CapturableGuidedDecoder") -> bool:
@@ -688,8 +1630,12 @@ def _sample_and_accept_draft_tokens_base(
                                          device=logits.device)
 
         # Sample tokens using per-request sampling parameters
-        target_tokens = self._sample_tokens_for_batch(logits, spec_metadata,
-                                                      num_contexts, batch_size)
+        target_tokens = self._sample_tokens_for_batch(
+            logits,
+            spec_metadata,
+            num_contexts,
+            batch_size,
+            draft_tokens=draft_tokens)
 
         # Context requests: only accept the sampled token (no draft tokens yet)
         accepted_tokens[:num_contexts, 0] = target_tokens[:num_contexts]
@@ -712,6 +1658,38 @@ def _sample_and_accept_draft_tokens_base(
 
         return accepted_tokens, num_accepted_tokens
 
+    @staticmethod
+    def _add_penalty_history_outputs(outputs: dict[str, torch.Tensor],
+                                     spec_metadata: SpecMetadata):
+        """Copy the penalty-tracking fields of ``spec_metadata`` that are set and
+        enabled into ``outputs`` (updated in place) and return ``outputs``."""
+        meta = spec_metadata
+        if meta.sampling_request_ids is not None:
+            outputs["penalty_sampling_request_ids"] = meta.sampling_request_ids
+        if meta.sampling_seq_slots is not None:
+            outputs["penalty_sampling_seq_slots"] = meta.sampling_seq_slots
+        if meta.use_device_penalty_counts:
+            if meta.device_penalty_count_seq_slots is not None:
+                outputs["penalty_count_seq_slots"] = meta.device_penalty_count_seq_slots
+            mode = meta.device_penalty_count_mode
+            if mode == "dense" and meta.device_penalty_token_counts is not None:
+                outputs["penalty_token_counts"] = meta.device_penalty_token_counts
+            elif (mode == "sparse" and meta.device_penalty_sparse_token_ids is not None
+                  and meta.device_penalty_sparse_token_counts is not None
+                  and meta.device_penalty_sparse_count_lens is not None):
+                outputs["penalty_sparse_token_ids"] = meta.device_penalty_sparse_token_ids
+                outputs["penalty_sparse_token_counts"] = meta.device_penalty_sparse_token_counts
+                outputs["penalty_sparse_count_lens"] = meta.device_penalty_sparse_count_lens
+                outputs["penalty_count_vocab_size"] = meta.device_penalty_count_vocab_size
+        if (meta.use_device_penalty_history
+                and meta.device_penalty_history_tokens is not None
+                and meta.device_penalty_history_lens is not None):
+            if meta.device_penalty_seq_slots is not None:
+                outputs["penalty_history_seq_slots"] = meta.device_penalty_seq_slots
+            outputs["penalty_history_tokens"] = meta.device_penalty_history_tokens
+            outputs["penalty_history_lens"] = meta.device_penalty_history_lens
+        return outputs
+
     def _draft_sampler_greedy(self, logits: torch.Tensor, d2t=None):
         """
         Simple greedy draft token sampling using argmax.
@@ -731,11 +1709,261 @@ def _draft_sampler_greedy(self, logits: torch.Tensor, d2t=None):
 
         return draft_tokens.type(torch.int32)
 
+    @staticmethod
+    def _draft_prefix_frequency_penalties(spec_metadata: SpecMetadata):
+        """Prefer the count-based seq frequency penalties; else history-based or None."""
+        count_based = spec_metadata.device_count_seq_frequency_penalties
+        if count_based is not None:
+            return count_based
+        return spec_metadata.device_seq_frequency_penalties
+
+    def _apply_draft_prefix_penalty_values(
+            self, logits: torch.Tensor, spec_metadata: SpecMetadata,
+            row_token_ids: torch.Tensor, row_penalty_values: torch.Tensor,
+            num_rows: int, width: int) -> torch.Tensor:
+        """Subtract the leading ``[num_rows, width]`` block of penalty values
+        from ``logits`` at the matching token ids; ``spec_metadata`` is unused."""
+        if (num_rows <= 0 or width <= 0 or row_token_ids is None
+                or row_penalty_values is None):
+            return logits
+        ids = row_token_ids[:num_rows, :width]
+        values = row_penalty_values[:num_rows, :width]
+        if os.environ.get("TRTLLM_SPEC_USE_PENALTY_OP", "0") == "1":
+            from .one_model_sampler import apply_recent_token_penalties
+            return apply_recent_token_penalties(logits, ids, values)
+        # scatter_add_ accumulates, so a token repeated within one row is
+        # penalized once per occurrence.
+        logits.scatter_add_(1, ids.long(), -values.to(logits.dtype))
+        return logits
+
+    def _apply_target_draft_prefix_frequency_penalty(
+            self, logits: torch.Tensor, spec_metadata: SpecMetadata,
+            num_contexts: int, batch_size: int,
+            draft_tokens: Optional[torch.Tensor]) -> torch.Tensor:
+        """Penalize, on row ``pos`` of each generation request, the draft tokens
+        ``draft_tokens[:, :pos]`` that precede it with that request's frequency
+        penalty. Context rows and row 0 keep zero penalty values. Does nothing
+        unless TRTLLM_SPEC_APPLY_DRAFT_PREFIX_PENALTY=1."""
+        if os.environ.get("TRTLLM_SPEC_APPLY_DRAFT_PREFIX_PENALTY",
+                          "0") != "1":
+            return logits
+        if draft_tokens is None or draft_tokens.numel() == 0:
+            return logits
+        runtime_draft_len = int(draft_tokens.shape[-1])
+        frequency_penalties = self._draft_prefix_frequency_penalties(
+            spec_metadata)
+        num_gens = batch_size - num_contexts
+        if runtime_draft_len <= 0 or frequency_penalties is None or num_gens <= 0:
+            return logits
+
+        row_stride = runtime_draft_len + 1
+        num_tokens = num_contexts + num_gens * row_stride
+        spec_metadata._ensure_draft_prefix_penalty_buffers(runtime_draft_len)
+        token_ids = spec_metadata.draft_prefix_penalty_token_ids
+        penalty_values = spec_metadata.draft_prefix_penalty_values
+        rows = spec_metadata.draft_prefix_penalty_rows
+        assert token_ids is not None
+        assert penalty_values is not None
+        assert rows is not None
+
+        token_ids[:num_tokens, :runtime_draft_len].zero_()
+        penalty_values[:num_tokens, :runtime_draft_len].zero_()
+        gen_penalties = frequency_penalties[num_contexts:batch_size].to(
+            penalty_values.dtype).unsqueeze(1)
+        for pos in range(1, row_stride):
+            target_rows = num_contexts + rows[:num_gens] * row_stride + pos
+            # Assign through index_put_: ``buf[target_rows, :pos].copy_(...)``
+            # would only fill the temporary copy made by advanced indexing.
+            token_ids[target_rows, :pos] = draft_tokens[:, :pos].to(token_ids.dtype)
+            penalty_values[target_rows, :pos] = gen_penalties.expand(-1, pos)
+        return self._apply_draft_prefix_penalty_values(
+            logits, spec_metadata, token_ids, penalty_values, num_tokens,
+            runtime_draft_len)
+
+    def _apply_draft_step_prefix_frequency_penalty(
+            self, logits: torch.Tensor, spec_metadata: SpecMetadata,
+            batch_size: int,
+            draft_prefix_tokens: Optional[torch.Tensor]) -> torch.Tensor:
+        """Penalize the tokens in ``draft_prefix_tokens`` on each of the first
+        ``batch_size`` logits rows with that row's frequency penalty. Does
+        nothing unless TRTLLM_SPEC_APPLY_DRAFT_PREFIX_PENALTY=1."""
+        enabled = os.environ.get("TRTLLM_SPEC_APPLY_DRAFT_PREFIX_PENALTY",
+                                 "0") == "1"
+        if (not enabled or draft_prefix_tokens is None
+                or draft_prefix_tokens.numel() == 0):
+            return logits
+        prefix_len = int(draft_prefix_tokens.shape[-1])
+        frequency_penalties = self._draft_prefix_frequency_penalties(
+            spec_metadata)
+        if prefix_len <= 0 or frequency_penalties is None:
+            return logits
+
+        spec_metadata._ensure_draft_prefix_penalty_buffers(prefix_len)
+        token_ids = spec_metadata.draft_prefix_penalty_token_ids
+        penalty_values = spec_metadata.draft_prefix_penalty_values
+        assert token_ids is not None
+        assert penalty_values is not None
+
+        token_ids[:batch_size, :prefix_len].copy_(
+            draft_prefix_tokens[:batch_size, :prefix_len])
+        per_row = frequency_penalties[:batch_size].to(torch.float32).unsqueeze(1)
+        penalty_values[:batch_size, :prefix_len].copy_(per_row.expand(-1, prefix_len))
+        return self._apply_draft_prefix_penalty_values(
+            logits, spec_metadata, token_ids, penalty_values, batch_size,
+            prefix_len)
+
+    def _maybe_apply_history_penalty_to_draft_logits(
+            self, logits: torch.Tensor, spec_metadata: SpecMetadata,
+            batch_size: int, d2t=None,
+            draft_prefix_tokens: Optional[torch.Tensor] = None):
+        """With TRTLLM_SPEC_APPLY_HISTORY_TO_DRAFT=1, apply the first available
+        history penalty (device counts, recent tokens, device history) before the
+        draft-prefix penalty, bailing out if ``d2t`` is set or buffers are missing."""
+        def add_prefix_penalty(penalized: torch.Tensor) -> torch.Tensor:
+            return self._apply_draft_step_prefix_frequency_penalty(
+                penalized, spec_metadata, batch_size, draft_prefix_tokens)
+
+        if os.environ.get("TRTLLM_SPEC_APPLY_HISTORY_TO_DRAFT", "0") != "1":
+            return add_prefix_penalty(logits)
+        if d2t is not None:
+            return logits
+
+        meta = spec_metadata
+        if (meta.use_device_penalty_counts
+                and meta.device_penalty_count_seq_slots is not None
+                and meta.device_count_seq_frequency_penalties is not None):
+            from .one_model_sampler import (apply_count_frequency_penalty,
+                                            apply_sparse_count_frequency_penalty)
+            meta.ensure_device_penalty_count_buffers(int(logits.shape[-1]))
+            seq_slots = meta.device_penalty_count_seq_slots[:batch_size]
+            seq_penalties = meta.device_count_seq_frequency_penalties[:batch_size]
+            if meta.device_penalty_count_mode == "dense":
+                if meta.device_penalty_token_counts is None:
+                    return logits
+                return add_prefix_penalty(apply_count_frequency_penalty(
+                    logits, meta.device_penalty_token_counts, seq_slots,
+                    seq_penalties))
+            if (meta.device_penalty_sparse_token_ids is None
+                    or meta.device_penalty_sparse_token_counts is None
+                    or meta.device_penalty_sparse_count_lens is None):
+                return logits
+            return add_prefix_penalty(apply_sparse_count_frequency_penalty(
+                logits, meta.device_penalty_sparse_token_ids,
+                meta.device_penalty_sparse_token_counts,
+                meta.device_penalty_sparse_count_lens, seq_slots,
+                seq_penalties))
+
+        if (meta.recent_seq_penalty_token_ids is not None
+                and meta.recent_seq_penalty_values is not None):
+            from .one_model_sampler import apply_recent_token_penalties
+            return add_prefix_penalty(apply_recent_token_penalties(
+                logits, meta.recent_seq_penalty_token_ids[:batch_size],
+                meta.recent_seq_penalty_values[:batch_size]))
+
+        history_ready = (meta.use_device_penalty_history
+                         and logits.dtype == torch.float32
+                         and meta.device_penalty_history_tokens is not None
+                         and meta.device_penalty_history_lens is not None
+                         and meta.device_penalty_seq_slots is not None
+                         and meta.device_seq_frequency_penalties is not None)
+        if not history_ready:
+            return logits
+        from .one_model_sampler import apply_history_frequency_penalty
+        return add_prefix_penalty(apply_history_frequency_penalty(
+            logits, meta.device_penalty_history_tokens,
+            meta.device_penalty_history_lens,
+            meta.device_penalty_seq_slots[:batch_size],
+            meta.device_seq_frequency_penalties[:batch_size]))
+
     def _execute_guided_decoder_if_present(self, logits):
         """Execute guided decoder on target model logits if available."""
         if self.guided_decoder is not None:
             self.guided_decoder.execute(logits)
 
+    def _debug_sampled_count_penalty(
+            self, logits: torch.Tensor, sampled_tokens: torch.Tensor,
+            spec_metadata: SpecMetadata, num_tokens: int) -> None:
+        """Log sampled tokens whose sparse penalty count reached a threshold.
+
+        Debug aid controlled by TRTLLM_SPEC_SAMPLE_DEBUG_COUNT_THRESHOLD (<= 0
+        disables), TRTLLM_SPEC_SAMPLE_DEBUG_MAX_LOGS (default 64, cumulative per
+        object) and TRTLLM_SPEC_SAMPLE_DEBUG_ROW_LIMIT (default 256 rows per
+        call). Skipped under CUDA graphs; reads tensors back with .cpu()/.item().
+        """
+        if spec_metadata.is_cuda_graph:
+            return
+        threshold = int(
+            os.environ.get("TRTLLM_SPEC_SAMPLE_DEBUG_COUNT_THRESHOLD", "0"))
+        if threshold <= 0:
+            return
+        max_logs = max(
+            int(os.environ.get("TRTLLM_SPEC_SAMPLE_DEBUG_MAX_LOGS", "64")), 0)
+        already_logged = int(getattr(self, "_sample_count_debug_logs", 0))
+        if max_logs <= 0 or already_logged >= max_logs:
+            return
+        meta = spec_metadata
+        if (meta.device_penalty_count_mode != "sparse"
+                or meta.device_penalty_count_row_slots is None
+                or meta.device_count_frequency_penalties is None
+                or meta.device_penalty_sparse_token_ids is None
+                or meta.device_penalty_sparse_token_counts is None
+                or meta.device_penalty_sparse_count_lens is None):
+            return
+
+        row_limit = min(
+            num_tokens,
+            int(os.environ.get("TRTLLM_SPEC_SAMPLE_DEBUG_ROW_LIMIT", "256")))
+        if row_limit <= 0:
+            return
+
+        def to_host_list(tensor: torch.Tensor) -> list:
+            return tensor[:row_limit].detach().cpu().tolist()
+
+        row_slots = to_host_list(meta.device_penalty_count_row_slots)
+        frequency_penalties = to_host_list(meta.device_count_frequency_penalties)
+        sampled = to_host_list(sampled_tokens)
+        count_lens = meta.device_penalty_sparse_count_lens.detach().cpu()
+        debug_rows = []
+
+        for row, (raw_slot, raw_token, raw_penalty) in enumerate(
+                zip(row_slots, sampled, frequency_penalties)):
+            if already_logged + len(debug_rows) >= max_logs:
+                break
+            slot, token, penalty = int(raw_slot), int(raw_token), float(raw_penalty)
+            if slot < 0 or penalty == 0.0 or slot >= count_lens.numel():
+                continue
+            count_len = int(count_lens[slot].item())
+            if count_len <= 0:
+                continue
+            slot_token_ids = meta.device_penalty_sparse_token_ids[
+                slot, :count_len].detach().cpu()
+            hits = torch.nonzero(slot_token_ids == token)
+            if hits.numel() == 0:
+                continue
+            count_index = int(hits[0].item())
+            count = int(
+                meta.device_penalty_sparse_token_counts[slot, count_index].item())
+            if count < threshold:
+                continue
+            row_logits = logits[row].detach().float()
+            selected_logit = float(row_logits[token].item())
+            max_logit = float(row_logits.max().item())
+            debug_rows.append({
+                "row": row,
+                "slot": slot,
+                "token": token,
+                "count": count,
+                "count_len": count_len,
+                "frequency_penalty": penalty,
+                "selected_logit": selected_logit,
+                "row_max_logit": max_logit,
+                "selected_minus_max": selected_logit - max_logit,
+            })
+
+        if debug_rows:
+            setattr(self, "_sample_count_debug_logs", already_logged + len(debug_rows))
+            logger.info("Spec sample count penalty debug rows=%s", debug_rows)
+
     def _prepare_next_new_tokens(self, accepted_tokens, next_draft_tokens,
                                  batch_indices_cuda, batch_size,
                                  num_accepted_tokens):
@@ -846,6 +2074,7 @@ def _sample_tokens_for_batch(
         spec_metadata: SpecMetadata,
         num_contexts: int,
         batch_size: int,
+        draft_tokens: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """
         Sample tokens from logits using per-request sampling parameters.
@@ -861,7 +2090,11 @@ def _sample_tokens_for_batch(
             sampled_tokens: [num_tokens] - Sampled token ids
         """
         if spec_metadata.allow_advanced_sampling:
-            from .one_model_sampler import sampling_batch_spec_dec_one_model
+            from .one_model_sampler import (apply_count_frequency_penalty,
+                                            apply_history_frequency_penalty,
+                                            apply_recent_token_penalties,
+                                            apply_sparse_count_frequency_penalty,
+                                            sampling_batch_spec_dec_one_model)
 
             num_gens = batch_size - num_contexts
             num_tokens = num_contexts + num_gens * (
@@ -870,6 +2103,56 @@ def _sample_tokens_for_batch(
             temperatures = spec_metadata.temperatures[:num_tokens]
             top_ks = spec_metadata.top_ks[:num_tokens]
             top_ps = spec_metadata.top_ps[:num_tokens]
+            if (spec_metadata.use_device_penalty_counts
+                    and spec_metadata.device_penalty_count_row_slots is not None
+                    and spec_metadata.device_count_frequency_penalties is not None):
+                spec_metadata.ensure_device_penalty_count_buffers(
+                    int(logits.shape[-1]))
+                if (spec_metadata.device_penalty_count_mode == "dense"
+                        and spec_metadata.device_penalty_token_counts is not None):
+                    logits = apply_count_frequency_penalty(
+                        logits,
+                        spec_metadata.device_penalty_token_counts,
+                        spec_metadata.device_penalty_count_row_slots[:num_tokens],
+                        spec_metadata.device_count_frequency_penalties[:
+                                                                      num_tokens])
+                elif (spec_metadata.device_penalty_count_mode == "sparse"
+                      and spec_metadata.device_penalty_sparse_token_ids is not None
+                      and spec_metadata.device_penalty_sparse_token_counts is not None
+                      and spec_metadata.device_penalty_sparse_count_lens is not None):
+                    logits = apply_sparse_count_frequency_penalty(
+                        logits,
+                        spec_metadata.device_penalty_sparse_token_ids,
+                        spec_metadata.device_penalty_sparse_token_counts,
+                        spec_metadata.device_penalty_sparse_count_lens,
+                        spec_metadata.device_penalty_count_row_slots[:num_tokens],
+                        spec_metadata.device_count_frequency_penalties[:
+                                                                      num_tokens])
+            elif (spec_metadata.use_device_penalty_history
+                    and logits.dtype == torch.float32
+                    and spec_metadata.device_penalty_history_tokens is not None
+                    and spec_metadata.device_penalty_history_lens is not None
+                    and spec_metadata.device_penalty_row_slots is not None
+                    and spec_metadata.device_frequency_penalties is not None):
+                logits = apply_history_frequency_penalty(
+                    logits,
+                    spec_metadata.device_penalty_history_tokens,
+                    spec_metadata.device_penalty_history_lens,
+                    spec_metadata.device_penalty_row_slots[:num_tokens],
+                    spec_metadata.device_frequency_penalties[:num_tokens])
+            else:
+                recent_penalty_token_ids = (
+                    None if spec_metadata.recent_penalty_token_ids is None else
+                    spec_metadata.recent_penalty_token_ids[:num_tokens])
+                recent_penalty_values = (
+                    None if spec_metadata.recent_penalty_values is None else
+                    spec_metadata.recent_penalty_values[:num_tokens])
+                if recent_penalty_token_ids is not None and recent_penalty_values is not None:
+                    logits = apply_recent_token_penalties(
+                        logits, recent_penalty_token_ids, recent_penalty_values)
+
+            logits = self._apply_target_draft_prefix_frequency_penalty(
+                logits, spec_metadata, num_contexts, batch_size, draft_tokens)
 
             if self.use_flashinfer:
                 top_ks = top_ks.clamp(min=1, max=logits.shape[-1] - 1)
@@ -892,6 +2175,8 @@ def _sample_tokens_for_batch(
                 use_flashinfer=self.use_flashinfer,
                 seed=self.seed,
                 offset=self.offset)
+            self._debug_sampled_count_penalty(logits, sampled_tokens,
+                                              spec_metadata, num_tokens)
         else:
             sampled_tokens = torch.argmax(logits, dim=-1)
 
diff --git a/tensorrt_llm/_torch/speculative/one_model_sampler.py b/tensorrt_llm/_torch/speculative/one_model_sampler.py
index 7d49aa85dd1..e337ee84b54 100644
--- a/tensorrt_llm/_torch/speculative/one_model_sampler.py
+++ b/tensorrt_llm/_torch/speculative/one_model_sampler.py
@@ -1,3 +1,4 @@
+import os
 from typing import Optional
 
 import torch
@@ -73,6 +74,106 @@ def apply_temperature(
     return logits.div_(temp.unsqueeze(dim=1))
 
 
+def apply_recent_token_penalties(logits: torch.Tensor, token_ids: torch.Tensor,
+                                 penalty_values: torch.Tensor) -> torch.Tensor:
+    """Apply recent-token penalties to ``logits`` and return ``logits``."""
+    if os.environ.get("TRTLLM_SPEC_USE_PENALTY_OP", "0") != "1":
+        # Default path: add the negated penalties at ``token_ids`` along dim 1.
+        negated = -penalty_values.to(logits.dtype)
+        return logits.scatter_add_(1, token_ids, negated)
+    penalty_op = torch.ops.trtllm.speculative_apply_token_penalties
+    penalty_op(logits, token_ids, penalty_values)
+    return logits
+
+
+def apply_history_frequency_penalty(
+        logits: torch.Tensor, history_tokens: torch.Tensor,
+        history_lens: torch.Tensor, row_slots: torch.Tensor,
+        frequency_penalties: torch.Tensor) -> torch.Tensor:
+    """Invoke the trtllm history frequency-penalty op, then return ``logits``."""
+    # The op's result is ignored; the caller gets back the tensor it passed in.
+    history_op = torch.ops.trtllm.speculative_apply_history_frequency_penalty
+    history_op(logits, history_tokens, history_lens, row_slots,
+               frequency_penalties)
+    return logits
+
+
+def apply_count_frequency_penalty(
+        logits: torch.Tensor, token_counts: torch.Tensor,
+        row_slots: torch.Tensor,
+        frequency_penalties: torch.Tensor) -> torch.Tensor:
+    """Invoke the trtllm count frequency-penalty op, then return ``logits``."""
+    # The op's result is ignored; the caller gets back the tensor it passed in.
+    count_op = torch.ops.trtllm.speculative_apply_count_frequency_penalty
+    count_op(logits, token_counts, row_slots, frequency_penalties)
+    return logits
+
+
+def apply_sparse_count_frequency_penalty(
+        logits: torch.Tensor, token_ids: torch.Tensor,
+        token_counts: torch.Tensor, count_lens: torch.Tensor,
+        row_slots: torch.Tensor,
+        frequency_penalties: torch.Tensor) -> torch.Tensor:
+    """Invoke the trtllm sparse-count frequency-penalty op on ``logits``."""
+    # The op's result is ignored; the caller gets back the ``logits`` tensor
+    # it passed in.
+    sparse_op = torch.ops.trtllm.speculative_apply_sparse_count_frequency_penalty
+    sparse_op(logits, token_ids, token_counts, count_lens, row_slots,
+              frequency_penalties)
+    return logits
+
+
+def append_accepted_tokens_to_history(
+        history_tokens: torch.Tensor, history_lens: torch.Tensor,
+        seq_slots: torch.Tensor, accepted_tokens: torch.Tensor,
+        accepted_lens: torch.Tensor) -> None:
+    """Forward the accepted tokens to the trtllm accepted-token append op."""
+    # Pure pass-through: arguments keep the order they were received in and
+    # nothing is returned to the caller.
+    append_op = torch.ops.trtllm.speculative_append_accepted_tokens
+    append_op(history_tokens, history_lens, seq_slots, accepted_tokens,
+              accepted_lens)
+
+
+def append_accepted_tokens_to_counts(
+        token_counts: torch.Tensor, seq_slots: torch.Tensor,
+        accepted_tokens: torch.Tensor,
+        accepted_lens: torch.Tensor) -> None:
+    """Forward the accepted tokens to the trtllm accepted-token-count op."""
+    # Pure pass-through; nothing is returned to the caller.
+    append_op = torch.ops.trtllm.speculative_append_accepted_token_counts
+    append_op(token_counts, seq_slots, accepted_tokens, accepted_lens)
+
+
+def append_accepted_tokens_to_sparse_counts(
+        token_ids: torch.Tensor, token_counts: torch.Tensor,
+        count_lens: torch.Tensor, seq_slots: torch.Tensor,
+        accepted_tokens: torch.Tensor, accepted_lens: torch.Tensor,
+        vocab_size: int) -> None:
+    """Forward the accepted tokens to the trtllm sparse token-count append op."""
+    # Pure pass-through: arguments keep the order they were received in and
+    # nothing is returned to the caller.
+    append_op = torch.ops.trtllm.speculative_append_sparse_token_counts
+    op_args = (token_ids, token_counts, count_lens, seq_slots,
+               accepted_tokens, accepted_lens, vocab_size)
+    append_op(*op_args)
+
+
+def init_sparse_token_counts(
+        token_ids: torch.Tensor, token_counts: torch.Tensor,
+        count_lens: torch.Tensor, prompt_token_ids: torch.Tensor,
+        prompt_token_counts: torch.Tensor, prompt_lens: torch.Tensor,
+        seq_slots: torch.Tensor, vocab_size: int) -> None:
+    """Forward the prompt token data to the trtllm sparse-count init op.
+
+    Pure pass-through: arguments keep the order they were received in and
+    nothing is returned to the caller.
+    """
+    init_op = torch.ops.trtllm.speculative_init_sparse_token_counts
+    init_op(token_ids, token_counts, count_lens, prompt_token_ids,
+            prompt_token_counts, prompt_lens, seq_slots, vocab_size)
+
+
 @torch.compile(options={"max-autotune": True})
 def sampling_batch_spec_dec_one_model(
     logits: torch.Tensor,
diff --git a/tensorrt_llm/_torch/speculative/spec_sampler_base.py b/tensorrt_llm/_torch/speculative/spec_sampler_base.py
index a795a19d6d8..b6550a428c5 100644
--- a/tensorrt_llm/_torch/speculative/spec_sampler_base.py
+++ b/tensorrt_llm/_torch/speculative/spec_sampler_base.py
@@ -19,11 +19,13 @@
 Eagle3OneModelSampler.
 """
 
+import os
 from dataclasses import dataclass
 from typing import Optional
 
 import torch
 
+from tensorrt_llm.bindings.executor import FinishReason
 from tensorrt_llm.logger import logger
 
 from ..pyexecutor.llm_request import LlmRequest, LlmRequestState
@@ -99,10 +101,27 @@ def __init__(self, args: TorchSampler.Args, *, draft_len: int):
         self.max_seq_len = args.max_seq_len
 
         seq_slots = args.max_num_sequences
+        self.max_num_sequences = seq_slots
         max_tokens = self._get_max_tokens(args, draft_len)
         draft_tokens_size = self._get_draft_tokens_storage_size(args, draft_len)
         self.max_beam_width = args.max_beam_width
         assert self.max_beam_width == 1, "beam width must be 1 for speculative decoding"
+        self.loop_guard_tokens = int(
+            os.environ.get("TRTLLM_SPEC_LOOP_GUARD_TOKENS", "0"))
+        self.loop_guard_period = max(
+            int(os.environ.get("TRTLLM_SPEC_LOOP_GUARD_PERIOD", "64")), 1)
+        self.loop_guard_repeats = max(
+            int(os.environ.get("TRTLLM_SPEC_LOOP_GUARD_REPEATS", "4")), 2)
+        self.loop_guard_window = max(
+            int(os.environ.get("TRTLLM_SPEC_LOOP_GUARD_WINDOW", "1024")), 128)
+        self.count_debug_tokens = int(
+            os.environ.get("TRTLLM_SPEC_COUNT_DEBUG_TOKENS", "0"))
+        self.count_debug_period = max(
+            int(os.environ.get("TRTLLM_SPEC_COUNT_DEBUG_PERIOD", "256")), 1)
+        self.count_debug_max_rows = max(
+            int(os.environ.get("TRTLLM_SPEC_COUNT_DEBUG_MAX_ROWS", "4")), 1)
+        self.count_debug_path_logged = False
+        self.count_debug_order_logged = False
 
         self.store = self.Store(
             new_tokens=int_tensor((max_tokens, seq_slots, self.max_beam_width)),
@@ -137,6 +156,11 @@ def _add_dummy_draft_tokens(self) -> bool:
         """
         return True
 
+    def _is_valid_slot(self, request: LlmRequest) -> bool:
+        """True for a non-dummy request whose slot is in [0, max_num_sequences)."""
+        s = request.py_seq_slot
+        return s is not None and 0 <= int(s) < self.max_num_sequences and not request.is_dummy
+
     def _request_common_handling(
         self,
         request: LlmRequest,
@@ -161,9 +185,302 @@ def _request_common_handling(
                 "return_log_probs not supported with speculative decoding, skipping for request %s",
                 request.py_request_id,
             )
-        request.py_draft_tokens = next_draft_tokens[request.py_seq_slot][:runtime_draft_len]
+        request.py_draft_tokens = next_draft_tokens[
+            request.py_seq_slot][:runtime_draft_len]
         request.py_decoding_iter += 1
 
+    def _ordered_sampling_requests_from_outputs(
+            self,
+            fallback_requests: list[LlmRequest],
+            outputs: dict[str, torch.Tensor],
+            num_skip: int) -> list[LlmRequest]:
+        """Reorder ``fallback_requests`` to follow the rows reported in ``outputs``.
+
+        Row ids come from ``outputs["penalty_sampling_request_ids"]`` and, when
+        present, row slots from ``outputs["penalty_sampling_seq_slots"]``; both
+        are read starting at index ``num_skip``. Every row claims one request
+        that no earlier row claimed, preferring a request whose id and slot
+        both match and otherwise settling for an id-only match.
+
+        ``fallback_requests`` itself is returned when the ids are absent, when
+        fewer rows than requests are available, or when a row matches nothing.
+        Otherwise a new list holding the same requests in row order is returned.
+        """
+
+        def slot_or_minus_one(req: LlmRequest) -> int:
+            raw_slot = req.py_seq_slot
+            return int(raw_slot) if raw_slot is not None else -1
+
+        raw_ids = outputs.get("penalty_sampling_request_ids")
+        if raw_ids is None:
+            return fallback_requests
+        all_ids = [int(value) for value in raw_ids]
+        raw_slots = outputs.get("penalty_sampling_seq_slots")
+        all_slots = None if raw_slots is None else [int(value) for value in raw_slots]
+
+        rows = slice(num_skip, num_skip + len(fallback_requests))
+        row_ids = all_ids[rows]
+        row_slots = None if all_slots is None else all_slots[rows]
+        if len(row_ids) != len(fallback_requests):
+            return fallback_requests
+
+        unclaimed = list(fallback_requests)
+        ordered_requests: list[LlmRequest] = []
+        for row, wanted_id in enumerate(row_ids):
+            wanted_slot = None if row_slots is None else row_slots[row]
+            # First choice: same request id and, if the row names a slot, same slot.
+            pick = next(
+                (pos for pos, req in enumerate(unclaimed)
+                 if int(req.py_request_id) == wanted_id and (
+                     wanted_slot is None or slot_or_minus_one(req) == wanted_slot)),
+                None)
+            if pick is None and wanted_slot is not None:
+                # Second choice: the slot disagreed everywhere, so match on id only.
+                pick = next((pos for pos, req in enumerate(unclaimed)
+                             if int(req.py_request_id) == wanted_id), None)
+            if pick is None:
+                return fallback_requests
+            ordered_requests.append(unclaimed.pop(pick))
+
+        # Debug aid, logged once: the first 16 (id, slot) pairs when the order moved.
+        if self.count_debug_tokens > 0 and not self.count_debug_order_logged:
+            fallback_head, ordered_head = (
+                [(int(req.py_request_id), slot_or_minus_one(req)) for req in group[:16]]
+                for group in (fallback_requests, ordered_requests))
+            if ordered_head != fallback_head:
+                self.count_debug_order_logged = True
+                logger.info(
+                    "Spec sampler remapped output row request order "
+                    "fallback_head=%s output_head=%s",
+                    fallback_head, ordered_head)
+
+        return ordered_requests
+
+    @staticmethod
+    def _prompt_len(request: LlmRequest) -> int:
+        """Return the first non-None prompt-length attribute clamped at 0, else 0."""
+        names = ("py_orig_prompt_len", "orig_prompt_len", "py_prompt_len",
+                 "prompt_len")
+        found = (getattr(request, name, None) for name in names)
+        first = next((value for value in found if value is not None), None)
+        return 0 if first is None else max(int(first), 0)
+
+    @staticmethod
+    def _has_repeated_suffix(tokens: list[int], repeats: int) -> bool:
+        """Report whether ``tokens`` ends in a repetitive run.
+
+        True when the last 256 tokens hold at most 8 distinct values, or when
+        the trailing 16-, 8- or 4-token chunk occurs ``repeats`` times back to
+        back at the very end of ``tokens``.
+        """
+        total = len(tokens)
+        if total >= 256 and len(set(tokens[-256:])) <= 8:
+            return True
+        for width in (16, 8, 4):
+            if total < width * repeats:
+                continue
+            tail = tokens[-width:]
+            if all(tokens[total - width * (k + 1):total - width * k] == tail
+                   for k in range(1, repeats)):
+                return True
+        return False
+
+    def _has_repeated_window(self, tokens: list[int]) -> bool:
+        """True when the last ``loop_guard_window`` tokens look stuck in a loop."""
+        recent = tokens[-self.loop_guard_window:]
+        if len(recent) < 512:
+            return False
+        if len(set(recent[-512:])) <= 32:
+            return True
+        for width, threshold in ((16, 3), (12, 4), (8, 8)):
+            if len(recent) < width * threshold:
+                continue
+            seen: dict[tuple[int, ...], int] = {}
+            for begin in range(len(recent) - width + 1):
+                gram = tuple(recent[begin:begin + width])
+                seen[gram] = seen.get(gram, 0) + 1
+                if seen[gram] >= threshold:
+                    return True
+        return False
+
+    def _maybe_finish_repetition_loop(self, request: LlmRequest, beam_idx: int,
+                                      prev_generated_len: int,
+                                      generated_len: int) -> bool:
+        """Finish ``request`` if its generated tail looks like a repetition loop.
+
+        Returns True only when the request was finished here.
+        """
+        threshold = self.loop_guard_tokens
+        if threshold <= 0 or generated_len < threshold:
+            return False
+        if prev_generated_len >= threshold and (
+                generated_len // self.loop_guard_period
+                == prev_generated_len // self.loop_guard_period):
+            return False
+        generated = request.get_tokens(beam_idx)[-generated_len:]
+        looping = (self._has_repeated_suffix(generated, self.loop_guard_repeats)
+                   or self._has_repeated_window(generated))
+        if looping:
+            logger.warning(
+                "Speculative decoding repetition guard stopped request_id=%s "
+                "generated_len=%s",
+                request.py_request_id, generated_len)
+            request.finish_by(FinishReason.STOP_WORDS, beam_idx)
+        return looping
+
+    def _should_debug_count_row(self, request: LlmRequest,
+                                accepted_len: int) -> tuple[bool, int, int]:
+        """Return ``(should_log, prev_generated_len, next_generated_len)`` for a row."""
+        start_at = self.count_debug_tokens
+        if start_at <= 0 or accepted_len <= 0:
+            return False, 0, 0
+        prompt_len = self._prompt_len(request)
+        before = max(len(request.get_tokens(0)) - prompt_len, 0)
+        after = before + accepted_len
+        if after < start_at:
+            return False, before, after
+        # Log on the step that first reaches ``start_at`` generated tokens, and
+        # afterwards whenever the count moves into a new period-sized bucket.
+        buckets = [max(n - start_at, 0) // self.count_debug_period for n in (before, after)]
+        return (before < start_at or buckets[0] != buckets[1]), before, after
+
+    def _maybe_log_count_debug_path(self, outputs: dict[str, torch.Tensor]) -> None:
+        """Log once which penalty-state entries ``outputs`` carries (debug only)."""
+        if self.count_debug_tokens <= 0 or self.count_debug_path_logged:
+            return
+        self.count_debug_path_logged = True
+        present = {key for key, value in outputs.items() if value is not None}
+        logger.info(
+            "Spec count debug path env_mode=%s dense=%s sparse=%s "
+            "history=%s history_appended=%s",
+            os.environ.get("TRTLLM_SPEC_COUNT_MODE", ""),
+            "penalty_token_counts" in present,
+            {"penalty_sparse_token_ids", "penalty_sparse_token_counts",
+             "penalty_sparse_count_lens"} <= present,
+            {"penalty_history_tokens", "penalty_history_lens"} <= present,
+            outputs.get("penalty_history_appended", False))
+
+    def _debug_dense_counts_before_append(
+            self, sampling_requests: list[LlmRequest],
+            penalty_slot_values: list[int], o_new_tokens: torch.Tensor,
+            o_new_tokens_lens: torch.Tensor, token_counts: torch.Tensor) -> None:
+        """Log token counts for a few rows before accepted tokens are appended.
+
+        Debug-only: returns at once unless ``self.count_debug_tokens`` is
+        positive, and logs at most ``self.count_debug_max_rows`` rows per call.
+        Rows that have no slot entry or a negative slot are skipped, as are rows
+        that ``_should_debug_count_row`` does not select. A logged row reports
+        ``token_counts[slot]`` for its accepted tokens and for the request's last
+        16 non-negative tokens; ids outside the count table are reported as 0
+        rather than looked up.
+        """
+        if self.count_debug_tokens <= 0:
+            return
+
+        accepted_lens = o_new_tokens_lens.detach().cpu().tolist()
+        rows_logged = 0
+        for row, request in enumerate(sampling_requests):
+            if rows_logged >= self.count_debug_max_rows:
+                return
+            if row >= len(penalty_slot_values):
+                continue
+            slot = penalty_slot_values[row]
+            if slot < 0:
+                continue
+            accepted_len = int(accepted_lens[row])
+            should_log, prev_generated_len, next_generated_len = (
+                self._should_debug_count_row(request, accepted_len))
+            if not should_log:
+                continue
+
+            accepted_row = o_new_tokens[row, :accepted_len].detach().cpu().tolist()
+            accepted_tokens = [int(token) for token in accepted_row]
+            history_tail = [int(token) for token in request.get_tokens(0)[-16:]
+                            if int(token) >= 0]
+            slot_counts = token_counts[slot]
+            # Only ids inside the table may be looked up: index_select rejects
+            # out-of-range indices (a device-side assert on CUDA), and a debug
+            # helper must not be able to take the run down. Others report 0.
+            lookup = [token for token in dict.fromkeys(accepted_tokens + history_tail)
+                      if 0 <= token < slot_counts.shape[0]]
+            count_by_token: dict[int, int] = {}
+            if lookup:
+                index = torch.tensor(lookup, dtype=torch.long, device=token_counts.device)
+                looked_up = slot_counts.index_select(0, index).detach().cpu().tolist()
+                count_by_token = {token: int(count) for token, count in zip(lookup, looked_up)}
+            accepted_counts = [(token, count_by_token.get(token, 0))
+                               for token in accepted_tokens]
+            history_tail_counts = [(token, count_by_token.get(token, 0))
+                                   for token in history_tail]
+
+            logger.info(
+                "Spec dense count debug before append request_id=%s slot=%s "
+                "prev_generated_len=%s next_generated_len=%s accepted_len=%s "
+                "accepted_counts=%s history_tail_counts=%s",
+                request.py_request_id, slot, prev_generated_len,
+                next_generated_len, accepted_len, accepted_counts,
+                history_tail_counts)
+            rows_logged += 1
+
+    def _debug_sparse_counts_before_append(
+            self, sampling_requests: list[LlmRequest],
+            penalty_slot_values: list[int], o_new_tokens: torch.Tensor,
+            o_new_tokens_lens: torch.Tensor, token_ids: torch.Tensor,
+            token_counts: torch.Tensor, count_lens: torch.Tensor,
+            count_vocab_size: int) -> None:
+        """Log sparse token counts for a few rows before the accepted-token append.
+
+        Debug-only: returns at once unless ``self.count_debug_tokens`` is
+        positive, and logs at most ``self.count_debug_max_rows`` rows per call.
+        Rows that have no slot entry or a negative slot are skipped, as are rows
+        that ``_should_debug_count_row`` does not select. For a logged row the
+        first ``count_lens[slot]`` entries (clamped to the width of
+        ``token_ids``) of ``token_ids[slot]`` / ``token_counts[slot]`` form the
+        id-to-count table; accepted tokens and the request's last 16
+        non-negative tokens are reported against it, defaulting to 0.
+        """
+        if self.count_debug_tokens <= 0:
+            return
+
+        lens_host = o_new_tokens_lens.detach().cpu().tolist()
+        logged = 0
+        for row, request in enumerate(sampling_requests):
+            if logged >= self.count_debug_max_rows:
+                return
+            slot = penalty_slot_values[row] if row < len(penalty_slot_values) else -1
+            if slot < 0:
+                continue
+            accepted_len = int(lens_host[row])
+            should_log, prev_generated_len, next_generated_len = (
+                self._should_debug_count_row(request, accepted_len))
+            if not should_log:
+                continue
+
+            stored = int(count_lens[slot].detach().cpu().item())
+            count_len = min(max(stored, 0), int(token_ids.shape[1]))
+            sparse_ids = token_ids[slot, :count_len].detach().cpu().tolist()
+            sparse_counts = token_counts[slot, :count_len].detach().cpu().tolist()
+            count_by_token = {int(tok): int(cnt) for tok, cnt in zip(sparse_ids, sparse_counts)}
+
+            def with_count(token: int) -> tuple[int, int]:
+                return token, count_by_token.get(token, 0)
+
+            accepted_tokens = o_new_tokens[row, :accepted_len].detach().cpu().tolist()
+            accepted_counts = [with_count(int(token)) for token in accepted_tokens]
+            history_tail = [int(token) for token in request.get_tokens(0)[-16:]
+                            if int(token) >= 0]
+            history_tail_counts = [with_count(token) for token in history_tail]
+
+            logger.info(
+                "Spec sparse count debug before append request_id=%s slot=%s "
+                "prev_generated_len=%s next_generated_len=%s accepted_len=%s "
+                "count_len=%s count_vocab_size=%s accepted_counts=%s "
+                "history_tail_counts=%s",
+                request.py_request_id, slot, prev_generated_len,
+                next_generated_len, accepted_len, count_len, count_vocab_size,
+                accepted_counts, history_tail_counts)
+            logged += 1
+
     def update_requests(
         self,
         state: SampleStateSpec,
@@ -196,6 +513,15 @@ def update_requests(
                     req, new_token, max_seq_len=self.max_seq_len, beam_idx=beam_idx
                 ):
                     break
+            if (self.loop_guard_tokens > 0
+                    and req.state != LlmRequestState.GENERATION_COMPLETE):
+                guard_len_attr = "_spec_loop_guard_generated_len"
+                prev_generated_len = getattr(req, guard_len_attr, 0)
+                generated_len = prev_generated_len + num_new_tokens
+                setattr(req, guard_len_attr, generated_len)
+                self._maybe_finish_repetition_loop(req, beam_idx,
+                                                   prev_generated_len,
+                                                   generated_len)
             req.py_num_accepted_draft_tokens = num_new_tokens - 1
             req.py_rewind_len = runtime_draft_len - req.py_num_accepted_draft_tokens
             self._request_common_handling(req, next_draft_tokens_list, runtime_draft_len)
@@ -224,11 +550,53 @@ def sample_async(
         """
         num_skip = len(scheduled_requests.context_requests_chunking)
         finished_context_requests = scheduled_requests.context_requests_last_chunk
-        sampling_requests = finished_context_requests + scheduled_requests.generation_requests
+        fallback_sampling_requests = (
+            finished_context_requests + scheduled_requests.generation_requests)
+        sampling_requests = self._ordered_sampling_requests_from_outputs(
+            fallback_sampling_requests, outputs, num_skip)
         num_sampling_requests = len(sampling_requests)
 
-        slots = torch.as_tensor([r.py_seq_slot for r in sampling_requests], dtype=torch.long)
+        valid_positions: list[int] = []
+        valid_slot_values: list[int] = []
+        penalty_slot_values: list[int] = []
+        valid_sampling_requests: list[LlmRequest] = []
+        for pos, request in enumerate(sampling_requests):
+            if self._is_valid_slot(request):
+                slot = int(request.py_seq_slot)
+                valid_positions.append(pos)
+                valid_slot_values.append(slot)
+                penalty_slot_values.append(slot)
+                valid_sampling_requests.append(request)
+            else:
+                penalty_slot_values.append(-1)
+
+        slots = torch.as_tensor(valid_slot_values, dtype=torch.long)
         slots = slots.to(device="cuda", non_blocking=True)
+        penalty_slots: Optional[torch.Tensor] = None
+
+        def fallback_penalty_slots() -> torch.Tensor:
+            nonlocal penalty_slots
+            if penalty_slots is None:
+                penalty_slots = torch.as_tensor(penalty_slot_values,
+                                                dtype=torch.int32)
+                penalty_slots = penalty_slots.to(device="cuda",
+                                                 non_blocking=True)
+            return penalty_slots
+
+        def output_slot_slice(name: str) -> Optional[torch.Tensor]:
+            seq_slots = outputs.get(name)
+            if seq_slots is None:
+                return None
+            end = num_skip + num_sampling_requests
+            if seq_slots.numel() < end:
+                return None
+            return seq_slots[num_skip:end].contiguous()
+
+        count_penalty_slots = output_slot_slice("penalty_count_seq_slots")
+        history_penalty_slots = output_slot_slice("penalty_history_seq_slots")
+        valid_positions_cuda = torch.as_tensor(valid_positions,
+                                               dtype=torch.long,
+                                               device="cuda")
 
         o_new_tokens = outputs["new_tokens"][num_skip : num_skip + num_sampling_requests]
         o_new_tokens_lens = outputs["new_tokens_lens"][num_skip : num_skip + num_sampling_requests]
@@ -238,6 +606,59 @@ def sample_async(
         o_next_new_tokens = outputs["next_new_tokens"][num_skip : num_skip + num_sampling_requests]
         runtime_draft_len = o_next_draft_tokens.shape[1]
 
+        self._maybe_log_count_debug_path(outputs)
+
+        penalty_token_counts = outputs.get("penalty_token_counts")
+        if penalty_token_counts is not None:
+            from .one_model_sampler import append_accepted_tokens_to_counts
+            self._debug_dense_counts_before_append(
+                sampling_requests, penalty_slot_values, o_new_tokens,
+                o_new_tokens_lens, penalty_token_counts)
+            append_accepted_tokens_to_counts(
+                penalty_token_counts,
+                count_penalty_slots
+                if count_penalty_slots is not None else fallback_penalty_slots(),
+                o_new_tokens.contiguous(),
+                o_new_tokens_lens.contiguous())
+        elif (outputs.get("penalty_sparse_token_ids") is not None
+              and outputs.get("penalty_sparse_token_counts") is not None
+              and outputs.get("penalty_sparse_count_lens") is not None):
+            from .one_model_sampler import append_accepted_tokens_to_sparse_counts
+            self._debug_sparse_counts_before_append(
+                sampling_requests, penalty_slot_values, o_new_tokens,
+                o_new_tokens_lens, outputs["penalty_sparse_token_ids"],
+                outputs["penalty_sparse_token_counts"],
+                outputs["penalty_sparse_count_lens"],
+                int(outputs.get("penalty_count_vocab_size", 0)))
+            append_accepted_tokens_to_sparse_counts(
+                outputs["penalty_sparse_token_ids"],
+                outputs["penalty_sparse_token_counts"],
+                outputs["penalty_sparse_count_lens"],
+                count_penalty_slots
+                if count_penalty_slots is not None else fallback_penalty_slots(),
+                o_new_tokens.contiguous(),
+                o_new_tokens_lens.contiguous(),
+                int(outputs.get("penalty_count_vocab_size", 0)))
+
+        penalty_history_tokens = outputs.get("penalty_history_tokens")
+        penalty_history_lens = outputs.get("penalty_history_lens")
+        if (penalty_history_tokens is not None
+                and penalty_history_lens is not None
+                and not outputs.get("penalty_history_appended", False)):
+            from .one_model_sampler import append_accepted_tokens_to_history
+            append_accepted_tokens_to_history(
+                penalty_history_tokens,
+                penalty_history_lens,
+                history_penalty_slots
+                if history_penalty_slots is not None else fallback_penalty_slots(),
+                o_new_tokens.contiguous(),
+                o_new_tokens_lens.contiguous())
+
+        o_new_tokens = o_new_tokens.index_select(0, valid_positions_cuda)
+        o_new_tokens_lens = o_new_tokens_lens.index_select(0, valid_positions_cuda)
+        o_next_draft_tokens = o_next_draft_tokens.index_select(0, valid_positions_cuda)
+        o_next_new_tokens = o_next_new_tokens.index_select(0, valid_positions_cuda)
+
         # Pad to match fixed-size store buffers for index_copy_.
         if o_new_tokens.shape[1] < (self.draft_len + 1):
             o_new_tokens = torch.nn.functional.pad(
@@ -275,10 +696,11 @@ def sample_async(
         # Add dummy draft tokens to context requests for KV cache preparation
         if self._add_dummy_draft_tokens():
             for request in finished_context_requests:
-                request.py_draft_tokens = [1] * self.draft_len
+                if self._is_valid_slot(request):
+                    request.py_draft_tokens = [1] * self.draft_len
 
         return SampleStateSpec(
-            requests=sampling_requests,
+            requests=valid_sampling_requests,
             device=device_tensors,
             host=host_tensors,
             sampler_event=sampler_event,
diff --git a/tensorrt_llm/_torch/speculative/utils.py b/tensorrt_llm/_torch/speculative/utils.py
index 33f37706399..0e14ace483b 100644
--- a/tensorrt_llm/_torch/speculative/utils.py
+++ b/tensorrt_llm/_torch/speculative/utils.py
@@ -34,7 +34,8 @@ def get_spec_metadata(spec_config,
                       max_num_tokens,
                       spec_resource_manager=None,
                       is_draft_model=False,
-                      max_seq_len=262144):
+                      max_seq_len=262144,
+                      max_num_sequence_slots: Optional[int] = None):
     if spec_config.spec_dec_mode.is_mtp_one_model():
         return MTPSpecMetadata(
             max_draft_len=spec_config.max_draft_len,
@@ -42,6 +43,7 @@ def get_spec_metadata(spec_config,
             spec_dec_mode=spec_config.spec_dec_mode,
             mtp_num_modules=spec_config.num_nextn_predict_layers,
             max_num_requests=max_num_requests,
+            max_num_sequence_slots=max_num_sequence_slots,
             mtp_hidden_states_manager=spec_resource_manager,
             allow_advanced_sampling=spec_config.allow_advanced_sampling,
         )
@@ -51,6 +53,7 @@ def get_spec_metadata(spec_config,
             max_total_draft_tokens=spec_config.tokens_per_gen_step - 1,
             spec_dec_mode=spec_config.spec_dec_mode,
             max_num_requests=max_num_requests,
+            max_num_sequence_slots=max_num_sequence_slots,
             num_layers=model_config.num_hidden_layers,
             hidden_size=model_config.hidden_size,
             max_num_tokens=max_num_tokens,
@@ -66,6 +69,7 @@ def get_spec_metadata(spec_config,
             max_total_draft_tokens=spec_config.tokens_per_gen_step - 1,
             spec_dec_mode=spec_config.spec_dec_mode,
             max_num_requests=max_num_requests,
+            max_num_sequence_slots=max_num_sequence_slots,
             num_layers=model_config.num_hidden_layers,
             hidden_size=model_config.hidden_size,
             max_num_tokens=max_num_tokens,
@@ -85,6 +89,7 @@ def get_spec_metadata(spec_config,
             max_total_draft_tokens=spec_config.tokens_per_gen_step - 1,
             spec_dec_mode=spec_config.spec_dec_mode,
             max_num_requests=max_num_requests,
+            max_num_sequence_slots=max_num_sequence_slots,
             num_layers=model_config.num_hidden_layers,
             hidden_size=model_config.hidden_size,
             max_num_tokens=max_num_tokens,
@@ -98,6 +103,7 @@ def get_spec_metadata(spec_config,
             max_total_draft_tokens=spec_config.tokens_per_gen_step - 1,
             spec_dec_mode=spec_config.spec_dec_mode,
             max_num_requests=max_num_requests,
+            max_num_sequence_slots=max_num_sequence_slots,
             allow_advanced_sampling=spec_config.allow_advanced_sampling,
             spec_resource_manager=spec_resource_manager,
         )
@@ -107,6 +113,7 @@ def get_spec_metadata(spec_config,
             max_total_draft_tokens=spec_config.max_total_draft_tokens,
             spec_dec_mode=spec_config.spec_dec_mode,
             max_num_requests=max_num_requests,
+            max_num_sequence_slots=max_num_sequence_slots,
             max_num_tokens=max_num_tokens,
             allow_advanced_sampling=spec_config.allow_advanced_sampling,
         )
@@ -116,6 +123,7 @@ def get_spec_metadata(spec_config,
             max_total_draft_tokens=1,
             spec_dec_mode=spec_config.spec_dec_mode,
             max_num_requests=max_num_requests,
+            max_num_sequence_slots=max_num_sequence_slots,
             num_model_layers=model_config.num_hidden_layers,
             hidden_size=model_config.hidden_size,
             max_num_tokens=max_num_tokens,
@@ -129,6 +137,7 @@ def get_spec_metadata(spec_config,
             max_total_draft_tokens=spec_config.max_total_draft_tokens,
             spec_dec_mode=spec_config.spec_dec_mode,
             max_num_requests=max_num_requests,
+            max_num_sequence_slots=max_num_sequence_slots,
             sa_manager=spec_resource_manager,
             max_matching_ngram_size=spec_config.max_matching_ngram_size,
         )
@@ -140,6 +149,7 @@ def get_spec_metadata(spec_config,
             max_total_draft_tokens=spec_config.tokens_per_gen_step - 1,
             spec_dec_mode=spec_config.spec_dec_mode,
             max_num_requests=max_num_requests,
+            max_num_sequence_slots=max_num_sequence_slots,
         )
     return None