From 118b6f05745a263c29140bbf64282d1c7e9d8983 Mon Sep 17 00:00:00 2001 From: dongfengy <99041270+dongfengy@users.noreply.github.com> Date: Tue, 26 May 2026 12:37:11 -0700 Subject: [PATCH 1/2] [https://nvbugs/6168859][fix] move tinygemm PDL release after reduction (#14537) Signed-off-by: Dongfeng Yu --- .../kernels/tinygemm2/tinygemm2_kernel.cuh | 6 +- tests/integration/test_lists/waives.txt | 786 +++++++++--------- 2 files changed, 402 insertions(+), 390 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh b/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh index ca95f6849bc..4592fe1d110 100644 --- a/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh +++ b/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh @@ -236,7 +236,6 @@ __global__ __launch_bounds__(384, 1) void tinygemm_kernel(__nv_bfloat16* output, if (!weight_warp) { cudaGridDependencySynchronize(); - cudaTriggerProgrammaticLaunchCompletion(); } for (int ki = 0; ki < K_LOOPS_DMA; ki++) @@ -422,6 +421,11 @@ __global__ __launch_bounds__(384, 1) void tinygemm_kernel(__nv_bfloat16* output, __syncthreads(); + if (threadIdx.x == 0) // one thread per block suffices according to official code examples + { + cudaTriggerProgrammaticLaunchCompletion(); + } + if (warp_id == 0) { diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 686b6211843..b26944b57de 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -1,413 +1,421 @@ -examples/test_openai.py::test_llm_openai_triton_1gpu SKIP (https://nvbugspro.nvidia.com/bug/4963654) -examples/test_openai.py::test_llm_openai_triton_plugingen_1gpu SKIP (https://nvbugspro.nvidia.com/bug/4963654) -full:GH200/examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (arm is not supported) -full:GH200/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (arm is not supported) 
-perf/test_perf.py::test_perf[t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449) -perf/test_perf.py::test_perf[flan_t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449) -perf/test_perf.py::test_perf[bart_large_cnn-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449) -full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/4731514) -perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:512,200-quant:fp8-tp:4] SKIP (SKIP due to timeout of quantization) -perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-quant:fp8-gpus:8] SKIP (SKIP due to timeout of quantization) +accuracy/test_cli_flow.py::TestGptNext::test_auto_dtype SKIP (https://nvbugs/6162940) +accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/6120535) +accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp1cp4] SKIP (https://nvbugs/6189918) +accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2] SKIP (https://nvbugs/6189918) +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_kv_cache_v2_nixl_python SKIP (https://nvbugs/6184575) +accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False] SKIP (https://nvbugs/6117811) +accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True] SKIP (https://nvbugs/6117811) +accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_kv_cache_v2_nixl_python SKIP (https://nvbugs/6117811) 
+accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664) +accuracy/test_dwdp_disaggregated_serving.py::TestDwdpDeepSeekV3Lite::test_dwdp_accuracy SKIP (https://nvbugs/6094102) +accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph SKIP (https://nvbugs/5772995) +accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] SKIP (https://nvbugs/5346443) +accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8 SKIP (https://nvbugs/5413197) +accuracy/test_llm_api_autodeploy.py::TestGemma4MoE::test_bf16 SKIP (https://nvbugs/6158397) +accuracy/test_llm_api_autodeploy.py::TestGemmaE2B::test_gemma4_e2b_it SKIP (https://nvbugs/6194934) +accuracy/test_llm_api_autodeploy.py::TestMiniMaxM2::test_finegrained_fp8 SKIP (https://nvbugs/6158397) +accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[nvfp4-1-trtllm] SKIP (https://nvbugs/6200112) +accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_functional_small[bf16] SKIP (https://nvbugs/6162114) +accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_functional_small[fp8] SKIP (https://nvbugs/6162114) +accuracy/test_llm_api_autodeploy.py::TestNemotronUltraV3::test_accuracy[nvfp4-8] SKIP (https://nvbugs/6215690) +accuracy/test_llm_api_autodeploy.py::TestQwen3_5_397B_MoE::test_bf16_small[4] SKIP (https://nvbugs/6158397) +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/6215736) +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] SKIP (https://nvbugs/6084775) +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/6029882) +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/6018046) 
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4] SKIP (https://nvbugs/6215793) +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus SKIP (https://nvbugs/6193778) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_dsa_host_cache_offload[host_cache_offload] SKIP (https://nvbugs/6185196) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_dsa_host_cache_offload[host_cache_offload_mtp1] SKIP (https://nvbugs/6185196) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_dsa_host_cache_offload[host_cache_offload_mtp3_no_adp] SKIP (https://nvbugs/6185196) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline] SKIP (https://nvbugs/6185196) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] SKIP (https://nvbugs/5955792) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[disable_skip_indexer] SKIP (https://nvbugs/5859886) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency_default] SKIP (https://nvbugs/6185196) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline] SKIP (https://nvbugs/6185196) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_mtp1] SKIP (https://nvbugs/6185196) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_piecewise_cuda_graph[baseline] SKIP (https://nvbugs/6185196) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_piecewise_cuda_graph[mtp3_fp8kv_chunked] SKIP (https://nvbugs/5989920) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6084720) 
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6095851) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/6162860) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6050489) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6162115) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6050489) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6050489) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/6162115) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/6162115) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6198785) 
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_python_scheduler[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-enable_chunked_prefill=True] SKIP (https://nvbugs/6071081) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_cute_dsl_nvfp4_4gpus[tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6185146) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6195110) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6162115) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] SKIP (https://nvbugs/6162115) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6112497) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6162122) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6084447) 
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True-sampler_async_worker=False] SKIP (https://nvbugs/6084447) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6084447) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=True] SKIP (https://nvbugs/6084447) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] SKIP (https://nvbugs/6084447) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=True] SKIP (https://nvbugs/6084447) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6084447) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=0] SKIP (https://nvbugs/6084447) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=2] SKIP (https://nvbugs/6084447) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=0] SKIP (https://nvbugs/6162122) 
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=2] SKIP (https://nvbugs/6162122) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=0] SKIP (https://nvbugs/6162122) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/6162122) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] SKIP (https://nvbugs/5955773) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6162122) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5945081) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6162115) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6162115) +accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype SKIP (https://nvbugs/6164924) +accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_2_model_mtp[2model] SKIP (https://nvbugs/5981293) +accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_2_model_mtp[2model_trtllm] SKIP (https://nvbugs/5981293) 
+accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_multi_gpus[throughput] SKIP (https://nvbugs/5981293) +accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_multi_gpus[throughput_trtllm] SKIP (https://nvbugs/5981293) +accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model] SKIP (https://nvbugs/5772993) +accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm] SKIP (https://nvbugs/5772360) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_dflash SKIP (https://nvbugs/6156233) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v1_kv_cache-trtllm-one_model-no_overlap_scheduler] SKIP (https://nvbugs/6220815) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v2_kv_cache-cutlass-two_model-no_overlap_scheduler] SKIP (https://nvbugs/6211880) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v2_kv_cache-trtllm-one_model-overlap_scheduler] SKIP (https://nvbugs/6215702) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model] SKIP (https://nvbugs/5596343) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model] SKIP (https://nvbugs/5596343) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-triton-auto] SKIP (https://nvbugs/6026676) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/6153955) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-cutlass-auto] SKIP (https://nvbugs/5596343) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-auto] SKIP (https://nvbugs/5596343) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-fp8] SKIP (https://nvbugs/5651865) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/5596343) 
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/6109750) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-cutlass-auto] SKIP (https://nvbugs/5596343) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-tp4-cutlass-auto] SKIP (https://nvbugs/5596343) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto] SKIP (https://nvbugs/5596343) +accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype SKIP (https://nvbugs/6209806) +accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4_longseq_trtllm_moe_async_cancel SKIP (https://nvbugs/6160085) +accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4_longseq_trtllm_moe_stress SKIP (https://nvbugs/6160085) +accuracy/test_llm_api_pytorch.py::TestLagunaXS::test_bf16 SKIP (https://nvbugs/6211185) +accuracy/test_llm_api_pytorch.py::TestLagunaXS::test_fp8 SKIP (https://nvbugs/6211185) +accuracy/test_llm_api_pytorch.py::TestLagunaXS::test_nvfp4 SKIP (https://nvbugs/6211185) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] SKIP (https://nvbugs/6141653) 
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5616182) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[use_temperature=False-attn_backend=TRTLLM] SKIP (https://nvbugs/5997547) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_dflash SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/6050489) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] SKIP (https://nvbugs/6141653) 
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler SKIP (https://nvbugs/6112497) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance] SKIP (https://nvbugs/6076767) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[llguidance-eagle3_one_model=False] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[llguidance-eagle3_one_model=True] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[xgrammar-eagle3_one_model=False] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[xgrammar-eagle3_one_model=True] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_ngram[llguidance] SKIP (https://nvbugs/6141653) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_ngram[xgrammar] SKIP (https://nvbugs/6141653) 
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_pard[overlap_scheduler=False] SKIP (https://nvbugs/6162114) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_pard[overlap_scheduler=True] SKIP (https://nvbugs/6162114) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_pard_sa SKIP (https://nvbugs/6162114) +accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype SKIP (https://nvbugs/5520319) +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=False-enable_gemm_allreduce_fusion=False] SKIP (https://nvbugs/6162122) +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=False-enable_gemm_allreduce_fusion=True] SKIP (https://nvbugs/6162122) +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True] SKIP (https://nvbugs/5775326) +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=True] SKIP (https://nvbugs/5821415) +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=True] SKIP (https://nvbugs/5821415) +accuracy/test_llm_api_pytorch.py::TestMiniMaxM2::test_4gpus[attention_dp=False-cuda_graph=True-overlap_scheduler=True-tp_size=4-ep_size=4] SKIP (https://nvbugs/6159132) +accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_fp8[latency_moe_deepgemm] SKIP (https://nvbugs/6163033) +accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm] SKIP (https://nvbugs/6162121) +accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm_eagle] SKIP (https://nvbugs/6157892) +accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus_mtp SKIP (https://nvbugs/6211693) +accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype SKIP (https://nvbugs/6076767) 
+accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap_adp_on] SKIP (https://nvbugs/6094068) +accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass] SKIP (https://nvbugs/6116088) +accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4_adp_on-trtllm] SKIP (https://nvbugs/6094068) +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency] SKIP (https://nvbugs/6177390) +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] SKIP (https://nvbugs/6177390) +accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8[enable_block_reuse=False] SKIP (https://nvbugs/6212252) +accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8[enable_block_reuse=True] SKIP (https://nvbugs/6210714) +accuracy/test_llm_api_pytorch.py::TestQwen3_5_9B::test_bf16[mtp_off] SKIP (https://nvbugs/6212250) +accuracy/test_llm_api_pytorch.py::TestQwen3_5_9B::test_bf16[mtp_on] SKIP (https://nvbugs/6212250) +accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales_early_first_token_response SKIP (https://nvbugs/6200128) +accuracy/test_llm_api_pytorch_multimodal.py::TestExaone4_5_33B::test_auto_dtype[forced_chunked_prefill] SKIP (https://nvbugs/6211189) +accuracy/test_llm_api_pytorch_multimodal.py::TestExaone4_5_33B::test_auto_dtype[full_budget] SKIP (https://nvbugs/6211189) +accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized SKIP (https://nvbugs/6189416) +accuracy/test_llm_api_pytorch_multimodal.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm] SKIP (https://nvbugs/6181383) +accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL::test_auto_dtype[forced_chunked_prefill] SKIP (https://nvbugs/6143787) +accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype SKIP (https://nvbugs/6114464) +accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray SKIP 
(https://nvbugs/6094070) +cpp/test_e2e.py::test_benchmarks[bart-90] SKIP (https://nvbugs/5550689) +cpp/test_e2e.py::test_benchmarks[gpt-80] SKIP (https://nvbugs/5550689) +cpp/test_e2e.py::test_model[-bart-90] SKIP (https://nvbugs/6162804) cpp/test_e2e.py::test_model[-encoder-90] SKIP (waive Encoder-only test because it doesn't take batched input) -full:GH200/unittest/trt/model_api/test_model_quantization.py SKIP (https://nvbugspro.nvidia.com/bug/4979955) +cpp/test_e2e.py::test_model[-gpt-80] SKIP (https://nvbugs/5983283) +cpp/test_e2e.py::test_model[-mamba-86] SKIP (https://nvbugs/5781665) +cpp/test_e2e.py::test_model[-redrafter-86] SKIP (https://nvbugs/5761642) +cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941) +cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-nixl_kvcache-90] SKIP (https://nvbugs/6093820) +cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-ucx_kvcache-90] SKIP (https://nvbugs/6093820) +cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5838199) +disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) +disaggregated/test_disaggregated.py::test_disaggregated_cancel_large_context_requests[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6105768) +disaggregated/test_disaggregated.py::test_disaggregated_chat_completion_tool_calls[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) +disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2_genpp2[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6114140) +disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6184906) +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_cache_aware_balance[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6162322) 
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_conditional[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6162322) +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162324) +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_gen_only[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap_cuda_graph[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxpp2_gentp2_one_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxtp2ep2pp2_gentp4_one_mtp_block_reuse[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxtp2ep2pp2_gentp4_one_mtp_block_reuse_long_prompt[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_overlap_cuda_graph[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_attention_dp_overlap_one_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu[DeepSeek-V3-Lite-fp8] SKIP 
(https://nvbugs/6162322) +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_two_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) +disaggregated/test_disaggregated.py::test_disaggregated_diff_max_tokens[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) +disaggregated/test_disaggregated.py::test_disaggregated_genbs1[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6162322) +disaggregated/test_disaggregated.py::test_disaggregated_kv_cache_time_output[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6184906) +disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) +disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) +disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) +disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_trt_backend[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) +disaggregated/test_disaggregated.py::test_disaggregated_perf_metrics[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) +disaggregated/test_disaggregated.py::test_disaggregated_single_gpu[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6184906) +disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_trt_backend[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) +disaggregated/test_workers.py::test_workers_conditional_disaggregation_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6094100) +disaggregated/test_workers.py::test_workers_conversation_router[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6162322) +disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) 
+disaggregated/test_workers.py::test_workers_kv_cache_aware_router_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6162322) +disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6162322) +disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6114139) +examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058) +examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] SKIP (https://nvbugs/5234058) +examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5234058) +examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1] SKIP (https://nvbugs/5546507) +examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] SKIP (https://nvbugs/5546507) +examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8] SKIP (https://nvbugs/5546507) +examples/test_gpt.py::test_llm_minitron_fp8_with_pseudo_loras[4b] SKIP (https://nvbugs/5606233) +examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] SKIP (https://nvbugs/5431132) +examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16] SKIP (https://nvbugs/5608979) +examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] SKIP (https://nvbugs/5608979) +examples/test_llama.py::test_llm_llama_1gpu_fp8_kv_cache[llama-v2-7b-hf-bfloat16] SKIP 
(https://nvbugs/5527940) +examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5802248) +examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5333849) +examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5222697) +examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818) +examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818) +examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818) examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5014327) -examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/4866931) examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (https://nvbugs/4961624) -full:sm100/unittest/trt/functional SKIP (Disable for Blackwell) -full:sm100/unittest/trt/quantization SKIP (Disable for Blackwell) -full:sm100/unittest/trt/attention/test_bert_attention.py SKIP (Disable for Blackwell) -full:sm100/unittest/trt/model/test_mamba.py SKIP (Disable for Blackwell) -full:sm100/unittest/bindings SKIP (Disable for Blackwell) -full:sm100/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py 
unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell) -full:sm100/unittest/trt/quantization/test_weight_only_quant_matmul.py SKIP (Disable for Blackwell) -full:sm100/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py SKIP (Disable for Blackwell) -full:sm100/unittest/trt/model/test_gpt.py -k "partition0" SKIP (Disable for Blackwell) -full:sm100/unittest/test_model_runner_cpp.py SKIP (Disable for Blackwell) -full:sm100/unittest/llmapi/test_llm_models.py -m "part0" SKIP (Disable for Blackwell for context fmha doesn't support when headsize is 80/96) -full:sm100/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (megatron-core 0.8 is not supported in python 3.12) -full:sm100/unittest/llmapi/test_llm_models.py -m "not (part0 or part1)" SKIP (Disable for Blackwell OOM) -full:sm100/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (megatron-core 0.8 is not supported in python 3.12) -full:B200/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugspro.nvidia.com/bug/5161074) -full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugspro.nvidia.com/bug/5161074) -full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugspro.nvidia.com/bug/5161074) -full:B200/perf/test_perf.py::test_perf[bart_large_cnn] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[bert_large] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[flan_t5_base] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[flan_t5_large] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[flan_t5_xl] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[flan_t5_xxl] SKIP 
(bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[mbart_large_50_many_to_one_mmt] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[roberta_base] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[t5_11b] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[t5_3b] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[t5_base] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[t5_large] SKIP (bert_attention_plugin does not support SM >= 100) -full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugspro.nvidia.com/bug/5161074) -full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugspro.nvidia.com/bug/5161074) -full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugspro.nvidia.com/bug/5161074) +examples/test_nemotron_nas.py::test_nemotron_nas_summary_1gpu[DeciLM-7B] SKIP (https://nvbugs/5444636) +examples/test_nemotron_nas.py::test_nemotron_nas_summary_2gpu[DeciLM-7B] SKIP (https://nvbugs/5444636) +examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16] SKIP (https://nvbugs/5612313) +examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (https://nvbugs/5447530) +examples/test_ray.py::test_ray_disaggregated_serving[tp2] SKIP (https://nvbugs/5612502) examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5174573) 
+examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-fp8-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5419070) +examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5401233) +examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int8_sq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5232405) examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-disable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221) examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221) examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221) -examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5222697) -examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16] SKIP (https://nvbugspro.nvidia.com/bug/5226339) -perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20] SKIP # https://nvbugspro.nvidia.com/bug/5207477 +examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b] SKIP (https://nvbugs/5401233) +examples/test_visual_gen.py::test_flux1_lpips_against_golden SKIP (https://nvbugs/6215688) +examples/test_visual_gen.py::test_flux2_lpips_against_golden SKIP (https://nvbugs/6215688) +examples/test_visual_gen.py::test_ltx2_lpips_against_golden SKIP 
(https://nvbugs/6215688) +examples/test_visual_gen.py::test_wan21_t2v_lpips_against_golden SKIP (https://nvbugs/6215688) +examples/test_visual_gen.py::test_wan22_t2v_lpips_against_golden SKIP (https://nvbugs/6215688) +examples/test_visual_gen.py::test_wan_t2v_example SKIP (https://nvbugs/6215688) +examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-disable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/5244570) +full:A10/unittest/kv_cache_manager_v2_tests/ SKIP (https://nvbugs/5841954) +full:A100/accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[trtllm-False] SKIP (https://nvbugs/6185480) +full:A100/accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[trtllm-True] SKIP (https://nvbugs/6185480) +full:B200/accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-4-trtllm] SKIP (https://nvbugs/6185150) +full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugs/5161074) +full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugs/5161074) +full:B200/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugs/5161074) +full:B300/unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "TRTLLM" SKIP (https://nvbugs/6165866) +full:DGX_H100/kv_cache/test_prefix_aware_scheduling.py::TestServePrefixAwareScheduling::test_multi_round_qa_shared_prefix[swa-chunked] SKIP (https://nvbugs/6136737) +full:GB200-OCI/accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-4-trtllm] SKIP (https://nvbugs/6185150) +full:GB200/perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6194788) +full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/4731514) 
+full:GH200/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (arm is not supported) +full:GH200/examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (arm is not supported) +full:GH200/unittest/trt/model_api/test_model_quantization.py SKIP (https://nvbugs/4979955) +full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache SKIP (https://nvbugs/5682551) +full:H20/accuracy/test_llm_api_autodeploy.py::TestNemotronV2::test_auto_dtype[False] SKIP (https://nvbugs/6185173) +full:H20/accuracy/test_llm_api_autodeploy.py::TestNemotronV2::test_auto_dtype[True] SKIP (https://nvbugs/6185173) +full:H20/accuracy/test_llm_api_autodeploy.py::TestNemotronV2::test_fp8[True] SKIP (https://nvbugs/6185173) +full:H20/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto] SKIP (https://nvbugs/6026676) +full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5569696) +full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5948435) +full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/5961814) +full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5961814) +full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] SKIP 
(https://nvbugs/5929339) +full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] SKIP (https://nvbugs/6128419) +full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugs/5161074) +full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugs/5161074) +full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugs/5161074) +full:sm100/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (megatron-core 0.8 is not supported in python 3.12) +full:sm100/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (megatron-core 0.8 is not supported in python 3.12) +full:sm100/unittest/bindings SKIP (Disable for Blackwell) +full:sm100/unittest/llmapi/test_llm_models.py -m "not (part0 or part1)" SKIP (Disable for Blackwell OOM) +full:sm100/unittest/llmapi/test_llm_models.py -m "part0" SKIP (Disable for Blackwell for context fmha doesn't support when headsize is 80/96) +full:sm100/unittest/test_model_runner_cpp.py SKIP (Disable for Blackwell) +full:sm100/unittest/trt/attention/test_bert_attention.py SKIP (Disable for Blackwell) +full:sm100/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell) +full:sm100/unittest/trt/functional SKIP (Disable for Blackwell) +full:sm100/unittest/trt/model/test_gpt.py -k "partition0" SKIP (Disable for Blackwell) +full:sm100/unittest/trt/model/test_mamba.py SKIP (Disable for Blackwell) +full:sm100/unittest/trt/quantization SKIP (Disable for Blackwell) 
+full:sm100/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py SKIP (Disable for Blackwell) +full:sm100/unittest/trt/quantization/test_weight_only_quant_matmul.py SKIP (Disable for Blackwell) +llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_eagle3 SKIP (https://nvbugs/6075431) +llmapi/test_llm_examples.py::test_llmapi_tensorrt_engine SKIP (https://nvbugs/5820553) +perf/test_perf.py::test_perf[bart_large_cnn-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449) perf/test_perf.py::test_perf[flan_t5_base-bench-float16-input_output_len:128,20] SKIP -perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20] SKIP +perf/test_perf.py::test_perf[flan_t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449) perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20-gpus:2] SKIP +perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20] SKIP perf/test_perf.py::test_perf[flan_t5_large-bench-float16-maxbs:1-input_output_len:128,20-gpus:2] SKIP -perf/test_perf.py::test_perf[whisper_large_v3-bench-float16-input_output_len:128,20] SKIP -perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:128,128] SKIP -perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:512,32] SKIP +perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8] SKIP +perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8] SKIP +perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-quant:fp8-gpus:8] SKIP (SKIP due to timeout of quantization) +perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:512,200-quant:fp8-tp:4] SKIP (SKIP due to timeout of quantization) 
perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:128,128] SKIP perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:512,32] SKIP +perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:128,128] SKIP +perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:512,32] SKIP perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20-gpus:2] SKIP perf/test_perf.py::test_perf[t5-bench-float16-maxbs:1-input_output_len:128,20-gpus:2] SKIP -perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8] SKIP -perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8] SKIP -full:NVIDIA_B200/perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8] SKIP (https://nvbugspro.nvidia.com/bug/5150255) -full:NVIDIA_B200/perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] SKIP (https://nvbugspro.nvidia.com/bug/5150255) -full:NVIDIA_B200/perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] SKIP (https://nvbugspro.nvidia.com/bug/5150255) -full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:128,128] SKIP (https://nvbugspro.nvidia.com/bug/5150255) -full:B200/perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float16-input_output_len:128,128-quant:fp8] SKIP (https://nvbugspro.nvidia.com/bug/5150255) -examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int8_sq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5232405) 
-examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058) -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058) -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] SKIP (https://nvbugs/5234058) -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5234058) -examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-disable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/5244570) -triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble] SKIP (https://nvbugs/5240060) -triton_server/test_triton.py::test_triton_extensive[triton-extensive] SKIP -triton_server/test_triton.py::test_qwen2_vl[qwen2_vl] SKIP -triton_server/test_triton.py::test_gpt_ib_speculative_decoding_bls[gpt-ib-speculative-decoding-bls] SKIP -triton_server/test_triton_llm.py::test_mistral_v1_multi_models[False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False-ensemble] SKIP -perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:512,200-quant:fp8-tp:4] SKIP (https://nvbugspro.nvidia.com/bug/5304388) 
-perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-cppmanager-exe-plugin_ifb-float16-input_output_len:200,2000-reqs:64-gpus:8] SKIP (https://nvbugspro.nvidia.com/bug/5304388) -perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-con:1] SKIP (https://nvbugspro.nvidia.com/bug/5304388) -perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:20000,2000-quant:fp8-con:1] SKIP (https://nvbugspro.nvidia.com/bug/5304388) -perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-con:1] SKIP (https://nvbugspro.nvidia.com/bug/5304388) -perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:20000,2000-con:1] SKIP (https://nvbugspro.nvidia.com/bug/5304388) -perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-con:1] SKIP (https://nvbugspro.nvidia.com/bug/5304388) -perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:20000,2000-reqs:500-con:250] SKIP (https://nvbugspro.nvidia.com/bug/5304388) -perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:5000,500-reqs:500-con:250] SKIP (https://nvbugspro.nvidia.com/bug/5304388) -perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:200,2000-reqs:64-con:1-gpus:8] SKIP (https://nvbugspro.nvidia.com/bug/5304388) -perf/test_perf.py::test_perf[llama_v3.1_8b-cppmanager-exe-plugin_ifb-bfloat16-mp-maxbs:256-input_output_len:128,128-pp:2] SKIP (https://nvbugspro.nvidia.com/bug/5295390) -perf/test_perf.py::test_perf[llama_70b_sq_per_tensor-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-gpus:2] SKIP (https://nvbugspro.nvidia.com/bug/5295390) 
-perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-ootb_except_mha-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4] SKIP (https://nvbugspro.nvidia.com/bug/5295390) -perf/test_perf.py::test_perf[mixtral_8x7b-cppmanager-exe-plugin_ifb-float16-mp-input_output_len:128,128+512,32-gpus:2] SKIP (https://nvbugspro.nvidia.com/bug/5295390) -perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-ootb_except_mha-float16-input_output_len:128,128+512,32-gpus:4] SKIP (https://nvbugspro.nvidia.com/bug/5295390) -perf/test_perf.py::test_perf[llama_v3_8b_instruct-cppmanager-exe-plugin_ifb-bfloat16-gwp:0.5-input_output_len:128,128+512,32] SKIP (https://nvbugspro.nvidia.com/bug/5295390) -perf/test_perf.py::test_perf[gpt_350m_moe-cpp-plugin-float16-bs:64-input_output_len:128,8+512,32] SKIP (https://nvbugspro.nvidia.com/bug/5295390) -perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-exe-plugin_ifb-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4] SKIP (https://nvbugspro.nvidia.com/bug/5295390) -perf/test_perf.py::test_perf[flan_t5_xxl-cppmanager-exe-plugin_ifb-float16-input_output_len:512,32-gpus:4] SKIP (https://nvbugspro.nvidia.com/bug/5295390) -perf/test_perf.py::test_perf[flan_t5_xxl-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128-gpus:4] SKIP (https://nvbugspro.nvidia.com/bug/5295390) -perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128-gpus:4] SKIP (https://nvbugspro.nvidia.com/bug/5295390) -perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:1024,1024-tp:2-gpus:2] SKIP (https://nvbugspro.nvidia.com/bug/5295411) -perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8-gpus:2] SKIP (https://nvbugspro.nvidia.com/bug/5295411) -perf/test_perf.py::test_perf[bart_large_cnn-bench-float16-input_output_len:128,20] SKIP (https://nvbugspro.nvidia.com/bug/5295411) 
-perf/test_perf.py::test_perf[mamba_130m-bench-float16-input_output_len:128,128] SKIP (https://nvbugspro.nvidia.com/bug/5295411) -perf/test_perf.py::test_perf[bert_large-bench-float16-maxbs:32-input_len:128+512] SKIP (https://nvbugspro.nvidia.com/bug/5295411) -perf/test_perf.py::test_perf[roberta_base-bench-float16-maxbs:32-input_len:128+512] SKIP (https://nvbugspro.nvidia.com/bug/5295411) -examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5333849) -examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818) -examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818) -triton_server/test_triton.py::test_mllama[mllama] SKIP (https://nvbugs/5333818) -examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818) -unittest/llmapi/test_llm_multi_gpu.py -m "gpu4 and part0" SKIP (https://nvbugs/5348958) -accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] SKIP (https://nvbugs/5346443) -examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5401233) -examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b] SKIP (https://nvbugs/5401233) -examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-fp8-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5419070) -examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] SKIP (https://nvbugs/5431132) -examples/test_nemotron_nas.py::test_nemotron_nas_summary_1gpu[DeciLM-7B] SKIP 
(https://nvbugs/5444636) -examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (https://nvbugs/5447530) -examples/test_nemotron_nas.py::test_nemotron_nas_summary_2gpu[DeciLM-7B] SKIP (https://nvbugs/5444636) -examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-cnn_dailymail-Qwen2-VL-7B-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5453709) -examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5453709) -triton_server/test_triton.py::test_gpt_ib[gpt-ib] SKIP (https://nvbugs/5431116) -accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8 SKIP (https://nvbugs/5413197) -triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (https://nvbugs/5371349) -triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624) -triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343) -triton_server/test_triton.py::test_t5_ib[t5-ib] SKIP (https://nvbugs/5456482) -triton_server/test_triton.py::test_python_bls_unit_tests[python-bls-unit-tests] SKIP (https://nvbugs/5477392) -triton_server/test_triton.py::test_mistral_ib[mistral-ib] SKIP (https://nvbugs/5477399) -triton_server/test_triton.py::test_eagle[eagle] SKIP (https://nvbugs/5477378) +perf/test_perf.py::test_perf[t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449) +perf/test_perf.py::test_perf[whisper_large_v3-bench-float16-input_output_len:128,20] SKIP +perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6215810) +perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] SKIP (https://nvbugs/6167060) 
+perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_dep8_mtp1_8k1k] SKIP (https://nvbugs/6190071) +perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_tep8_mtp3_8k1k] SKIP (https://nvbugs/6189928) +perf/test_perf_sanity.py::test_e2e[aggr_upload-llama3_1_8b_fp8_ad_hopper-llama3_1_8b_ad_ws1_1k1k] SKIP (https://nvbugs/6192201) +perf/test_perf_sanity.py::test_e2e[aggr_upload-super_ad_blackwell-super_ad_ws1_1k1k] SKIP (https://nvbugs/6153575) +perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6215844) +perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] SKIP (https://nvbugs/6085022) +perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6179661) +perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6215844) +perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] SKIP (https://nvbugs/6085022) +perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6016528) +perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] SKIP (https://nvbugs/6085022) +perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6085022) +perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL] SKIP (https://nvbugs/6200257) 
+perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6215844) +perf/test_visual_gen_perf_sanity.py::test_visual_gen_e2e[vg_upload-flux2_blackwell-flux2_fp8_cfg1_ulysses4_teacache_on] SKIP (https://nvbugs/6162857) +perf/test_visual_gen_perf_sanity.py::test_visual_gen_e2e[vg_upload-ltx2_blackwell-ltx2_2stage_bf16_i2v_cfg2_ulysses4_compile_on] SKIP (https://nvbugs/6162857) +perf/test_visual_gen_perf_sanity.py::test_visual_gen_e2e[vg_upload-ltx2_blackwell-ltx2_2stage_bf16_t2v_cfg2_ulysses4_compile_on] SKIP (https://nvbugs/6162857) +perf/test_visual_gen_perf_sanity.py::test_visual_gen_e2e[vg_upload-ltx2_blackwell-ltx2_nvfp4_i2v_cfg2_ulysses4_compile_on] SKIP (https://nvbugs/6162857) +perf/test_visual_gen_perf_sanity.py::test_visual_gen_e2e[vg_upload-wan21_t2v_14b_blackwell-wan21_14b_nvfp4_trtllm_cfg2_ulysses4_teacache_on] SKIP (https://nvbugs/6162857) +perf/test_visual_gen_perf_sanity.py::test_visual_gen_e2e[vg_upload-wan22_i2v_a14b_blackwell-wan22_i2v_a14b_nvfp4_trtllm_cfg2_ulysses4] SKIP (https://nvbugs/6162857) +stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1-0528-FP4_tp4-stress_time_3600s_timeout_10800s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy] SKIP (https://nvbugs/6207678) +stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1-0528-FP4_tp4-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy] SKIP (https://nvbugs/6207678) +stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp8-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy] SKIP (https://nvbugs/6143599) +stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_10800s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy] SKIP (https://nvbugs/6143599) 
+stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy] SKIP (https://nvbugs/6143599) +stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test] SKIP (https://nvbugs/6215678) +stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test] SKIP (https://nvbugs/6215678) +test_doc.py::test_url_validity SKIP (https://nvbugs/6215684) +test_e2e.py::test_draft_token_tree_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B] SKIP (https://nvbugs/5989907) +test_e2e.py::test_draft_token_tree_quickstart_advanced_eagle3_depth_1_tree[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B] SKIP (https://nvbugs/5989907) +test_e2e.py::test_multi_nodes_eval[DeepSeek-R1/DeepSeek-R1-0528-FP4-tp16-mmlu] SKIP (https://nvbugs/6114608) +test_e2e.py::test_multi_nodes_eval[Kimi-K2-Thinking-NVFP4-tp16-mmlu] SKIP (https://nvbugs/6114608) +test_e2e.py::test_multi_nodes_eval[Qwen3/Qwen3-235B-A22B-tp16-mmlu] SKIP (https://nvbugs/6115560) +test_e2e.py::test_multi_nodes_eval[Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf-tp16-mmlu] SKIP (https://nvbugs/6114608) +test_e2e.py::test_multi_nodes_eval[nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-tp16-mmlu] SKIP (https://nvbugs/6114608) test_e2e.py::test_openai_chat_example[trt] SKIP (https://nvbugs/5477444) +test_e2e.py::test_openai_completions_example[trt] SKIP (https://nvbugs/5701450) +test_e2e.py::test_openai_disagg_multi_nodes_completion[ctx_tp1pp2-gen_tp1pp2] SKIP (https://nvbugs/6190759) +test_e2e.py::test_openai_disagg_multi_nodes_completion_service_discovery[http] SKIP (https://nvbugs/6115562) 
+test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_w4afp8_8gpus[DeepSeek-R1-W4AFP8-DeepSeek-R1/DeepSeek-R1-W4AFP8] SKIP (https://nvbugs/5836830) test_e2e.py::test_trtllm_bench_iteration_log[TRT-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B] SKIP (https://nvbugs/5448523) -accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype SKIP (https://nvbugs/5520319) -examples/test_llama.py::test_llm_llama_1gpu_fp8_kv_cache[llama-v2-7b-hf-bfloat16] SKIP (https://nvbugs/5527940) -examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8] SKIP (https://nvbugs/5546507) -examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1] SKIP (https://nvbugs/5546507) -examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] SKIP (https://nvbugs/5546507) -cpp/test_e2e.py::test_benchmarks[gpt-80] SKIP (https://nvbugs/5550689) -cpp/test_e2e.py::test_benchmarks[bart-90] SKIP (https://nvbugs/5550689) -full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553) -full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5574553) -full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True] SKIP (https://nvbugs/5574553) -full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5574553) -full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553) -full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553) -full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553) 
-full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5574553) -full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True] SKIP (https://nvbugs/5574553) -full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5574553) -full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553) -full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553) +test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/5864769) +triton_server/test_triton.py::test_cpp_unit_tests[cpp-unit-tests] SKIP (https://nvbugs/5619359) +triton_server/test_triton.py::test_eagle[eagle] SKIP (https://nvbugs/5477378) triton_server/test_triton.py::test_gpt_2b_ib_lora[gpt-2b-ib-lora] SKIP (https://nvbugs/5470830) -unittest/llmapi/test_memory_profiling.py::test_profile_kvcache SKIP (https://nvbugs/5580781) +triton_server/test_triton.py::test_gpt_disaggregated_serving_bls[gpt-disaggregated-serving-bls] SKIP (https://nvbugs/5582118) +triton_server/test_triton.py::test_gpt_gather_logits[gpt-gather-logits] SKIP (https://nvbugs/5766960) +triton_server/test_triton.py::test_gpt_ib[gpt-ib] SKIP (https://nvbugs/5431116) +triton_server/test_triton.py::test_gpt_ib_lad[gpt-ib-lad] SKIP (https://nvbugs/5775223) +triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624) +triton_server/test_triton.py::test_gpt_ib_speculative_decoding_bls[gpt-ib-speculative-decoding-bls] SKIP +triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (https://nvbugs/5371349) +triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-decoding] SKIP (https://nvbugs/5762854) 
triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414) -full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5569696) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-cutlass-auto] SKIP (https://nvbugs/5596343) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-auto] SKIP (https://nvbugs/5596343) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-cutlass-auto] SKIP (https://nvbugs/5596343) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-tp4-cutlass-auto] SKIP (https://nvbugs/5596343) -examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16] SKIP (https://nvbugs/5612313) -triton_server/test_triton.py::test_cpp_unit_tests[cpp-unit-tests] SKIP (https://nvbugs/5619359) -triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] SKIP (https://nvbugs/5619369) -examples/test_gpt.py::test_llm_minitron_fp8_with_pseudo_loras[4b] SKIP (https://nvbugs/5606233) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5616182) -full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache SKIP (https://nvbugs/5682551) -test_e2e.py::test_openai_completions_example[trt] SKIP (https://nvbugs/5701450) +triton_server/test_triton.py::test_llava_onevision[llava_onevision] SKIP (https://nvbugs/5775205) +triton_server/test_triton.py::test_mistral_ib[mistral-ib] SKIP (https://nvbugs/5477399) +triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343) +triton_server/test_triton.py::test_mllama[mllama] SKIP 
(https://nvbugs/5333818) +triton_server/test_triton.py::test_python_bls_unit_tests[python-bls-unit-tests] SKIP (https://nvbugs/5477392) +triton_server/test_triton.py::test_qwen2_vl[qwen2_vl] SKIP +triton_server/test_triton.py::test_t5_ib[t5-ib] SKIP (https://nvbugs/5456482) +triton_server/test_triton.py::test_triton_extensive[triton-extensive] SKIP triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm] SKIP (https://nvbugs/5701480) +triton_server/test_triton_llm.py::test_mistral_v1_multi_models[False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False-ensemble] SKIP +triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble] SKIP (https://nvbugs/5240060) +triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] SKIP (https://nvbugs/5619369) +unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend[act=Relu2-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=NVFP4-routing=Renormalize] SKIP (https://nvbugs/5989912) +unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=IGNORE-e8_k1_h512_i512-seq=8-dtype=torch.bfloat16-backend=MEGAMOE_DEEPGEMM-quant=W4A8_MXFP4_MXFP8-routing=DeepSeekV3] SKIP (https://nvbugs/6175060) +unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu_eplb -k "MEGAMOE_DEEPGEMM" SKIP (https://nvbugs/6175060) +unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e256_k8_h7168_i2048-seq=1-dtype=torch.bfloat16-backend=MEGAMOE_DEEPGEMM-quant=W4A8_MXFP4_MXFP8-routing=DeepSeekV3] SKIP (https://nvbugs/6175060) 
unittest/_torch/modules/tests_lora_modules/test_lora_attention_pytorch_flow_vs_trt.py::TestLoraAttentionPytorchFlowVsTRT::test_lora_attention SKIP (https://nvbugs/5701421) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto] SKIP (https://nvbugs/5596343) +unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-bf16-_tokens16-_hidden32] SKIP (https://nvbugs/5940460) +unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-bf16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460) +unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-bf16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460) +unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-bf16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460) +unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-fp16-_tokens16-_hidden32] SKIP (https://nvbugs/5940460) +unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-fp16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460) +unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-fp16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460) +unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-fp16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460) +unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens16-_hidden32] SKIP (https://nvbugs/5940460) +unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460) +unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460) +unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460) 
+unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens16-_hidden32] SKIP (https://nvbugs/5940460) +unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460) +unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460) +unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460) +unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py::test_cp_tp_broadcast_object[tp_cp_broadcast-list] SKIP (https://nvbugs/6132301) +unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingDSv3-swiglu-1024-1024-1] SKIP (https://nvbugs/5908070) +unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_qwen_next-swiglu-1024-1024-150] SKIP (https://nvbugs/5908070) +unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_topk_4-swiglu-1024-1024-150] SKIP (https://nvbugs/5908070) +unittest/_torch/visual_gen/test_flux_pipeline.py::TestFluxCombinedOptimizations::test_all_optimizations_combined SKIP (https://nvbugs/6199854) +unittest/auto_deploy/singlegpu/models/test_qwen3_5_moe.py::test_vision_attention_matches_reference SKIP (https://nvbugs/6189450) +unittest/auto_deploy/singlegpu/models/test_qwen3_5_moe.py::test_vision_block_matches_reference SKIP (https://nvbugs/6189450) +unittest/auto_deploy/singlegpu/models/test_qwen3_5_moe.py::test_vlm_wrapper_delta_is_request_scoped_no_cross_call_leakage SKIP (https://nvbugs/6189450) +unittest/auto_deploy/singlegpu/smoke/test_ad_build_small_single.py::test_build_ad[deepseek-ai/DeepSeek-V3-llm_extra_args10] SKIP (https://nvbugs/5888827) +unittest/auto_deploy/standalone SKIP (https://nvbugs/6160629) 
+unittest/auto_deploy/standalone/test_standalone_package.py::TestStandalonePackage::test_run_unit_tests SKIP (https://nvbugs/6160629) +unittest/disaggregated/test_agent_multi_backends.py::test_run_with_different_env[1] SKIP (https://nvbugs/5979673) unittest/executor/test_rpc.py::TestRpcCorrectness::test_incremental_task_async SKIP (https://nvbugs/5741476) -cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941) -examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16] SKIP (https://nvbugs/5608979) -examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] SKIP (https://nvbugs/5608979) -unittest/_torch/speculative/test_dynamic_spec_decode.py::test_dynamic_spec_decode SKIP (https://nvbugs/5758449) -triton_server/test_triton.py::test_gpt_disaggregated_serving_bls[gpt-disaggregated-serving-bls] SKIP (https://nvbugs/5582118) -triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-decoding] SKIP (https://nvbugs/5762854) -examples/test_ray.py::test_ray_disaggregated_serving[tp2] SKIP (https://nvbugs/5612502) unittest/executor/test_rpc_proxy.py SKIP (https://nvbugs/5605741) unittest/executor/test_rpc_worker.py SKIP (https://nvbugs/5605741) -cpp/test_e2e.py::test_model[-redrafter-86] SKIP (https://nvbugs/5761642) -triton_server/test_triton.py::test_gpt_gather_logits[gpt-gather-logits] SKIP (https://nvbugs/5766960) -accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm] SKIP (https://nvbugs/5772360) -accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model] SKIP (https://nvbugs/5772993) -accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph SKIP (https://nvbugs/5772995) -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True] SKIP (https://nvbugs/5775326) 
-triton_server/test_triton.py::test_llava_onevision[llava_onevision] SKIP (https://nvbugs/5775205) -triton_server/test_triton.py::test_gpt_ib_lad[gpt-ib-lad] SKIP (https://nvbugs/5775223) -cpp/test_e2e.py::test_model[-mamba-86] SKIP (https://nvbugs/5781665) -unittest/_torch/ray_orchestrator/multi_gpu/test_multi_instance.py::test_multi_instance[tp2_2instances] SKIP (https://nvbugs/5784566) -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False] SKIP (https://nvbugs/5795918) -examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5802248) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model] SKIP (https://nvbugs/5596343) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model] SKIP (https://nvbugs/5596343) -examples/test_ray.py::test_llm_inference_distributed_ray[tp2pp2] SKIP (https://nvbugs/5781731) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/5819005) -unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks SKIP (https://nvbugs/5819014) -unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_gptoss_style_nvfp4[limit1-beta0-alpha1-RoutingGPTOSS-512-512-1] SKIP (https://nvbugs/5819042) -llmapi/test_llm_examples.py::test_llmapi_tensorrt_engine SKIP (https://nvbugs/5820553) -accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_vswa_reuse SKIP (https://nvbugs/5820497) -accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_guided_decoding_vswa_reuse[xgrammar] SKIP (https://nvbugs/5820497) -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=True] SKIP 
(https://nvbugs/5821415) -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=True] SKIP (https://nvbugs/5821415) -test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_w4afp8_8gpus[DeepSeek-R1-W4AFP8-DeepSeek-R1/DeepSeek-R1-W4AFP8] SKIP (https://nvbugs/5836830) -accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664) -cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5838199) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/5838211) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/5838211) -full:A10/unittest/kv_cache_manager_v2_tests/ SKIP (https://nvbugs/5841954) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[disable_skip_indexer] SKIP (https://nvbugs/5859886) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-fp8] SKIP (https://nvbugs/5651865) -test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/5864769) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5879577) -unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_gptoss_style_nvfp4[limitinf-beta0-alpha0.1-RoutingGPTOSS-512-512-1] SKIP (https://nvbugs/5819042) -unittest/_torch/flashinfer/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5920779) -test_e2e.py::test_openai_chat_guided_decoding[openai/gpt-oss-120b] SKIP (https://nvbugs/5884677) -unittest/auto_deploy/singlegpu/smoke/test_ad_build_small_single.py::test_build_ad[deepseek-ai/DeepSeek-V3-llm_extra_args10] SKIP (https://nvbugs/5888827) 
-unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingDSv3-swiglu-1024-1024-1] SKIP (https://nvbugspro.nvidia.com/bug/5908070) -unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_qwen_next-swiglu-1024-1024-150] SKIP (https://nvbugspro.nvidia.com/bug/5908070) -unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_topk_4-swiglu-1024-1024-150] SKIP (https://nvbugspro.nvidia.com/bug/5908070) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugspro.nvidia.com/bug/5916092) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugspro.nvidia.com/bug/5916155) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugspro.nvidia.com/bug/5916155) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugspro.nvidia.com/bug/5916155) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugspro.nvidia.com/bug/5916155) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugspro.nvidia.com/bug/5916155) 
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugspro.nvidia.com/bug/5916155) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugspro.nvidia.com/bug/5916155) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugspro.nvidia.com/bug/5916155) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugspro.nvidia.com/bug/5916155) -unittest/_torch/visual_gen/test_wan.py::TestWanTwoStageTransformer::test_two_stage_with_trtllm_attention SKIP (https://nvbugspro.nvidia.com/bug/5916830) -full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] SKIP (https://nvbugs/5929339) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens16-_hidden32] SKIP (https://nvbugs/5940460) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460) 
-unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens16-_hidden32] SKIP (https://nvbugs/5940460) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-fp16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-fp16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-fp16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-fp16-_tokens16-_hidden32] SKIP (https://nvbugs/5940460) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-bf16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-bf16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-bf16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_mm_add_prologue[2-bf16-_tokens16-_hidden32] SKIP (https://nvbugs/5940460) -cpp/test_e2e.py::test_model[-gpt-80] SKIP (https://nvbugs/5983283) -full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v2_kv_cache-cutlass-one_model-overlap_scheduler] SKIP (https://nvbugs/5945047) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5945081) 
-full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5948435) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] SKIP (https://nvbugs/5955773) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] SKIP (https://nvbugs/5955792) -full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/5961814) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_2_model_mtp SKIP (https://nvbugs/5966585) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model] SKIP (https://nvbugs/5927636) -full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5961814) -test_e2e.py::test_ptp_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] SKIP (https://nvbugs/5969725) -accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_accuracy[nvfp4-4-attn_dp_on-trtllm] SKIP (https://nvbugs/5973199) -unittest/disaggregated/test_agent_multi_backends.py::test_run_with_different_env[1] SKIP (https://nvbugs/5979673) -verl/test_verl_cases.py::test_adapter SKIP (https://nvbugs/5981833) -verl/test_verl_cases.py::test_async_server SKIP (https://nvbugs/5981833) -verl/test_verl_cases.py::test_rollout_utils SKIP (https://nvbugs/5981833) 
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/5839028) -accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_multi_gpus[throughput_trtllm] SKIP (https://nvbugs/5981293) -accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_2_model_mtp[2model] SKIP (https://nvbugs/5981293) -test_e2e.py::test_draft_token_tree_quickstart_advanced_eagle3_depth_1_tree[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B] SKIP (https://nvbugs/5989907) -test_e2e.py::test_draft_token_tree_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B] SKIP (https://nvbugs/5989907) -unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend[act=Relu2-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=NVFP4-routing=Renormalize] SKIP (https://nvbugs/5989912) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_piecewise_cuda_graph[mtp3_fp8kv_chunked] SKIP (https://nvbugs/5989920) -accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5992113) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[use_temperature=False-attn_backend=TRTLLM] SKIP (https://nvbugs/5997547) -accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8 SKIP (https://nvbugs/6004530) -unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=DEEPEP-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=NVFP4-routing=Renormalize] SKIP (https://nvbugs/6007285) -accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[latency] SKIP (https://nvbugs/6012526) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/6018046) -test_fmha.py::test_fmha SKIP (https://nvbugs/6018058) 
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/6029882) -accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_accuracy[bf16-4-attn_dp_off-trtllm] SKIP (https://nvbugs/5919796) -accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_accuracy[fp8-4-attn_dp_off-trtllm] SKIP (https://nvbugs/6058066) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False-v2_kv_cache=False] SKIP (https://nvbugs/6027594) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_pard[overlap_scheduler=True] SKIP (https://nvbugs/6037653) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_pard[overlap_scheduler=False] SKIP (https://nvbugs/6037653) -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] SKIP (https://nvbugs/6037654) -examples/test_visual_gen.py::test_vbench_dimension_score_wan22_a14b_nvfp4 SKIP (https://nvbugs/6050483) -visual_gen/test_visual_gen_benchmark.py::test_online_benchmark[openai-videos] SKIP (https://nvbugs/6050483) -examples/test_visual_gen.py::test_vbench_dimension_score_wan SKIP (https://nvbugs/6050483) -examples/test_visual_gen.py::test_vbench_dimension_score_wan22_a14b_fp8 SKIP (https://nvbugs/6050483) +unittest/llmapi/test_llm_multi_gpu.py -m "gpu4 and part0" SKIP (https://nvbugs/5348958) +unittest/llmapi/test_llm_multi_gpu_pytorch.py::test_phi3_lora_fused_modules_output_on_tp2_identical_to_tp1 SKIP (https://nvbugs/6109745) +unittest/llmapi/test_memory_profiling.py::test_profile_kvcache SKIP (https://nvbugs/5580781) +unittest/tools/test_layer_wise_benchmarks.py::test_performance_alignment[1] SKIP (https://nvbugs/6127669) +unittest/tools/test_layer_wise_benchmarks.py::test_qwen3_next_gen_tep[1] SKIP (https://nvbugs/6153575) visual_gen/test_visual_gen_benchmark.py::test_offline_benchmark SKIP 
(https://nvbugs/6050483) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[pp4-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/6050487) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6050489) -disaggregated/test_disaggregated.py::test_disaggregated_gpt_oss_120b_harmony[gpt_oss/gpt-oss-120b] SKIP (https://nvbugs/6011317) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6050489) -disaggregated/test_disaggregated.py::test_disaggregated_overlap_gen_first[ctx_pp1-TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6057459) -disaggregated/test_disaggregated.py::test_disaggregated_overlap_gen_first[ctx_pp4-TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6057460) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX] SKIP (https://nvbugs/6085022) -full:sm89/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized[torch_compile=False] SKIP (https://nvbugs/6070878) -full:sm89/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized[torch_compile=True] SKIP (https://nvbugs/6070878) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-triton-auto] SKIP (https://nvbugs/6026676) -accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus] SKIP (https://nvbugs/6069790) -accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_2_model_mtp[2model_trtllm] SKIP (https://nvbugs/5981293) -accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_multi_gpus[throughput] SKIP (https://nvbugs/5981293) -disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0] SKIP 
(https://nvbugs/6069686) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6074784) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx_tp1_single_gpu[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6074784) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6074784) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_two_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6074784) -accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[google_gemma-3-1b-it-False] SKIP (https://nvbugs/6059036) -accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[meta-llama_Llama-3.1-8B-Instruct-False] SKIP (https://nvbugs/6059036) -accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[mistralai_Codestral-22B-v0.1-False] SKIP (https://nvbugs/6059036) -accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[mistralai_Ministral-8B-Instruct-2410-False] SKIP (https://nvbugs/6059036) -accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[Qwen_QwQ-32B-False] SKIP (https://nvbugs/6059036) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/6050489) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6050489) -accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[deepseek-ai_DeepSeek-R1-0528-True] SKIP (https://nvbugs/6070955) 
-accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B_Instruct_Eagle3::test_eagle3_one_model SKIP (https://nvbugs/5997534) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6076560) -accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[trtllm-flashinfer_ssm-False] SKIP (https://nvbugs/6076564) -unittest/_torch/ray_orchestrator/multi_gpu/test_llm_update_weights_multi_gpu.py SKIP (https://nvbugs/6076624) -unittest/llmapi/test_llm_pytorch.py::test_llm_disagg_streaming_gen_cancelled SKIP (https://nvbugs/6078431) -unittest/auto_deploy/singlegpu/transformations/library/test_mrope_delta_cache.py::test_qwen_registry_configs_explicitly_enable_mrope_delta_cache SKIP (https://nvbugs/6078421) -accuracy/test_llm_api_autodeploy.py::TestQwen3_5_397B_MoE::test_nvfp4[8] SKIP (https://nvbugs/6080024) -accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[meta-llama_Llama-3.3-70B-Instruct-False] SKIP (https://nvbugs/6059036) -full:DGX_H100/unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py::test_reducescatter_pg_op[var_len:True-seqlen:64-hidden:128] SKIP (https://nvbugs/6080037) -llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_eagle3 SKIP (https://nvbugs/6075431) -llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp SKIP (https://nvbugs/6079440) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6079919) -disaggregated/test_disaggregated.py::test_disaggregated_benchmark_gen_only_insufficient_kv[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6087632) 
-disaggregated/test_disaggregated.py::test_disaggregated_conditional[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6087632) -disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6087632) -disaggregated/test_disaggregated.py::test_disaggregated_kv_cache_time_output[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6087632) -disaggregated/test_disaggregated.py::test_disaggregated_ngram[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6087632) -disaggregated/test_disaggregated.py::test_disaggregated_overlap[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6087632) -disaggregated/test_disaggregated.py::test_disaggregated_single_gpu[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6087632) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6084720) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] SKIP (https://nvbugs/6084764) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] SKIP (https://nvbugs/6084775) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6084824) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=fp8-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True] SKIP (https://nvbugs/6084445) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=0] SKIP (https://nvbugs/6084447) -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] SKIP (https://nvbugs/6084568) 
-perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/6088149) -accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/6070857) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=False] SKIP (https://nvbugs/6094071) -accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray SKIP (https://nvbugs/6094070) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_dummy_load_format SKIP (https://nvbugs/6094072) -cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-nixl_kvcache-90] SKIP (https://nvbugs/6093820) -cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-ucx_kvcache-90] SKIP (https://nvbugs/6093820) -accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[fp8-1-trtllm] SKIP (https://nvbugs/6094208) -accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-1-trtllm] SKIP (https://nvbugs/6094208) -accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[torch-True-1] SKIP (https://nvbugs/6093714) -accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[trtllm-True] SKIP (https://nvbugs/6093713) -accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[trtllm-False] SKIP (https://nvbugs/6093713) -accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_gen_first[ctx_tp1pp1-gen_tp1pp1] SKIP (https://nvbugs/6093712) -accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap_adp_on] SKIP (https://nvbugs/6094068) -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9-fp8kv=True] SKIP (https://nvbugs/6094066) -accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[nvidia_Llama-3.1-8B-Instruct-NVFP4-True] 
SKIP (https://nvbugs/6093715) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] SKIP (https://nvbugs/6084447) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6084447) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=2] SKIP (https://nvbugs/6084447) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=True] SKIP (https://nvbugs/6084447) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True-sampler_async_worker=False] SKIP (https://nvbugs/6084447) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=True] SKIP (https://nvbugs/6084447) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6084447) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6084447) -disaggregated/test_auto_scaling.py::test_service_discovery[etcd-kv_cache_aware] SKIP 
(https://nvbugs/6094100) -disaggregated/test_disaggregated.py::test_disaggregated_chat_completion_tool_calls[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) -disaggregated/test_auto_scaling.py::test_service_discovery[http-load_balancing] SKIP (https://nvbugs/6094100) -disaggregated/test_auto_scaling.py::test_minimal_instances[http-round_robin] SKIP (https://nvbugs/6094100) -disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) -disaggregated/test_auto_scaling.py::test_worker_restart[http-round_robin] SKIP (https://nvbugs/6094100) -disaggregated/test_auto_scaling.py::test_service_discovery[etcd-load_balancing] SKIP (https://nvbugs/6094100) -disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin] SKIP (https://nvbugs/6094100) -disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_gentp2[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) -disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) -disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/6094100) -disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_trt_backend[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) -disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) -disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_genpp2[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) -disaggregated/test_auto_scaling.py::test_worker_restart[etcd-load_balancing] SKIP (https://nvbugs/6094100) -disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_trt_backend[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) -disaggregated/test_workers.py::test_workers_conditional_disaggregation_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6094100) 
-disaggregated/test_auto_scaling.py::test_service_discovery[http-kv_cache_aware] SKIP (https://nvbugs/6094100) -disaggregated/test_auto_scaling.py::test_minimal_instances[etcd-round_robin] SKIP (https://nvbugs/6094100) -disaggregated/test_disaggregated.py::test_disaggregated_diff_max_tokens[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) -disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) -disaggregated/test_auto_scaling.py::test_worker_restart[etcd-kv_cache_aware] SKIP (https://nvbugs/6094100) -disaggregated/test_auto_scaling.py::test_service_discovery[etcd-round_robin] SKIP (https://nvbugs/6094100) -disaggregated/test_auto_scaling.py::test_disagg_server_restart[http-round_robin] SKIP (https://nvbugs/6094100) -disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) -disaggregated/test_disaggregated.py::test_disaggregated_perf_metrics[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6094100) -disaggregated/test_auto_scaling.py::test_worker_restart[http-load_balancing] SKIP (https://nvbugs/6094100) -disaggregated/test_auto_scaling.py::test_worker_restart[http-kv_cache_aware] SKIP (https://nvbugs/6094100) -disaggregated/test_auto_scaling.py::test_service_discovery[http-round_robin] SKIP (https://nvbugs/6094100) -disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_llama_context_capacity[False-False-DeepSeek-V3-Lite-fp8/fp8] SKIP (https://nvbugs/6095421) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6095421) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/6095421) 
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/6095421) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[pp4-attn_backend=FLASHINFER-torch_compile=False] SKIP (https://nvbugs/6095421) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6095421) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6095421) -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM] SKIP (https://nvbugs/6095421) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6095421) -perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_tep8_mtp3] SKIP (https://nvbugs/6095700) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6095851) -full:DGX_B200/examples/test_visual_gen.py::test_wan_t2v_example SKIP (https://nvbugs/6097980) -accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/6098442) 
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6098790) +visual_gen/test_visual_gen_benchmark.py::test_online_benchmark[openai-videos] SKIP (https://nvbugs/6050483) From 209630d412d916156e8bd2d46ef3728b8020917a Mon Sep 17 00:00:00 2001 From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> Date: Sun, 7 Jun 2026 21:07:12 -0700 Subject: [PATCH 2/2] [TensorRT-LLM][SpecDec] Capture count penalty in CUDA graphs CUDA graph warmup can capture speculative sampling without the generated-token count frequency-penalty path when warmup requests have no frequency penalty. Later RWLT GPT-OSS disagg requests replay that graph with frequency_penalty and prompt_ignore_length, so repeated generated tokens are not penalized. Add speculative logits penalty CUDA ops, preserve sequence-slot count state across CUDA graph metadata/replay, append accepted tokens back into count state, and gate forced graph count capture to the disaggregated generation role by default. Validation: python3 -m py_compile on modified Python modules; git diff --cached --check; GPT-OSS disagg original NVBug config ran 8 total auto-gating runs with >10k=0 and 16K/length=0. 
--- .../logitsPenaltyKernels.cu | 498 +++++++ .../logitsPenaltyKernels.h | 56 + cpp/tensorrt_llm/thop/CMakeLists.txt | 1 + .../thop/speculativeLogitsPenaltyOp.cpp | 473 ++++++ .../_torch/pyexecutor/model_engine.py | 114 +- tensorrt_llm/_torch/speculative/interface.py | 1297 ++++++++++++++++- .../_torch/speculative/one_model_sampler.py | 101 ++ .../_torch/speculative/spec_sampler_base.py | 432 +++++- tensorrt_llm/_torch/speculative/utils.py | 12 +- 9 files changed, 2966 insertions(+), 18 deletions(-) create mode 100644 cpp/tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.cu create mode 100644 cpp/tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.h create mode 100644 cpp/tensorrt_llm/thop/speculativeLogitsPenaltyOp.cpp diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.cu new file mode 100644 index 00000000000..695748b1bdc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.cu @@ -0,0 +1,498 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. 
SPDX-License-Identifier: Apache-2.0 + */ + +#include "tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.h" + +#include +#include +#include + +TRTLLM_NAMESPACE_BEGIN + +namespace kernels +{ + +template +__global__ void applySpeculativeTokenPenaltiesKernel( + T* logits, TokenT const* tokenIds, float const* penaltyValues, int32_t width, int32_t vocabSize) +{ + auto const row = static_cast(blockIdx.x); + auto const rowTokenIds = tokenIds + row * width; + auto const rowPenaltyValues = penaltyValues + row * width; + auto rowLogits = logits + row * vocabSize; + + if (width <= 32) + { + if (threadIdx.x != 0) + { + return; + } + for (auto idx = 0; idx < width; ++idx) + { + auto const tokenId = static_cast(rowTokenIds[idx]); + auto const penalty = rowPenaltyValues[idx]; + if (penalty != 0.0f && tokenId >= 0 && tokenId < vocabSize) + { + auto const offset = static_cast(tokenId); + auto logit = static_cast(rowLogits[offset]); + logit -= penalty; + rowLogits[offset] = static_cast(logit); + } + } + return; + } + + for (auto idx = static_cast(threadIdx.x); idx < width; idx += static_cast(blockDim.x)) + { + auto const tokenId = static_cast(rowTokenIds[idx]); + auto const penalty = rowPenaltyValues[idx]; + if (penalty != 0.0f && tokenId >= 0 && tokenId < vocabSize) + { + auto const offset = static_cast(tokenId); + auto logit = static_cast(rowLogits[offset]); + logit -= penalty; + rowLogits[offset] = static_cast(logit); + } + } +} + +template +void invokeApplySpeculativeTokenPenalties(T* logits, TokenT const* tokenIds, float const* penaltyValues, int32_t numRows, + int32_t width, int32_t vocabSize, cudaStream_t stream) +{ + if (numRows == 0 || width == 0) + { + return; + } + + dim3 const grid(numRows); + dim3 const block(std::min(width, 256)); + applySpeculativeTokenPenaltiesKernel<<>>( + logits, tokenIds, penaltyValues, width, vocabSize); +} + +template void invokeApplySpeculativeTokenPenalties( + float*, int32_t const*, float const*, int32_t, int32_t, int32_t, 
cudaStream_t); +template void invokeApplySpeculativeTokenPenalties( + float*, int64_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t); +template void invokeApplySpeculativeTokenPenalties( + half*, int32_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t); +template void invokeApplySpeculativeTokenPenalties( + half*, int64_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t); +template void invokeApplySpeculativeTokenPenalties<__nv_bfloat16, int32_t>( + __nv_bfloat16*, int32_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t); +template void invokeApplySpeculativeTokenPenalties<__nv_bfloat16, int64_t>( + __nv_bfloat16*, int64_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t); + +__global__ void applySpeculativeHistoryFrequencyPenaltyKernel(float* logits, int32_t const* historyTokens, + int32_t const* historyLens, int32_t const* rowSlots, float const* frequencyPenalties, int32_t historyCapacity, + int32_t vocabSize) +{ + auto const row = static_cast(blockIdx.x); + auto const frequencyPenalty = frequencyPenalties[row]; + if (frequencyPenalty == 0.0f) + { + return; + } + + auto const slot = rowSlots[row]; + if (slot < 0) + { + return; + } + + auto const historyLen = min(max(historyLens[slot], 0), historyCapacity); + auto const rowHistory = historyTokens + static_cast(slot) * historyCapacity; + auto rowLogits = logits + static_cast(row) * vocabSize; + + for (auto idx = static_cast(threadIdx.x); idx < historyLen; idx += static_cast(blockDim.x)) + { + auto const tokenId = rowHistory[idx]; + if (tokenId >= 0 && tokenId < vocabSize) + { + atomicAdd(rowLogits + tokenId, -frequencyPenalty); + } + } +} + +void invokeApplySpeculativeHistoryFrequencyPenalty(float* logits, int32_t const* historyTokens, + int32_t const* historyLens, int32_t const* rowSlots, float const* frequencyPenalties, int32_t numRows, + int32_t historyCapacity, int32_t vocabSize, cudaStream_t stream) +{ + if (numRows == 0 || historyCapacity 
== 0)
+    {
+        return;
+    }
+
+    dim3 const grid(numRows);
+    dim3 const block(256);
+    applySpeculativeHistoryFrequencyPenaltyKernel<<>>(
+        logits, historyTokens, historyLens, rowSlots, frequencyPenalties, historyCapacity, vocabSize);
+}
+
+// Appends the accepted tokens of one row to the token history of that row's sequence slot.
+//
+// Launch layout: one block per row (blockIdx.x indexes seqSlots / acceptedLens / acceptedTokens);
+// the threads of the block copy the row's tokens cooperatively.
+//
+// - Rows whose slot is negative are skipped.
+// - At most acceptedStride tokens are read from the row, and the copy is truncated so that the slot's
+//   history never grows beyond historyCapacity.
+// - historyLens[slot] is advanced by the number of tokens actually written.
+//
+// NOTE(review): two rows mapping to the same slot would still race across blocks; this kernel assumes
+// the caller passes each slot at most once per launch -- confirm against the call site.
+__global__ void appendSpeculativeAcceptedTokensKernel(int32_t* historyTokens, int32_t* historyLens,
+    int32_t const* seqSlots, int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t acceptedStride,
+    int32_t historyCapacity)
+{
+    auto const row = static_cast(blockIdx.x);
+    auto const slot = seqSlots[row];
+    if (slot < 0)
+    {
+        return;
+    }
+
+    // Clamp to the row stride so a bogus length can never read past this row's accepted tokens
+    // (same clamping as appendSpeculativeAcceptedTokenCountsKernel).
+    auto const acceptedLen = min(max(acceptedLens[row], 0), acceptedStride);
+    if (acceptedLen == 0)
+    {
+        return;
+    }
+
+    auto const oldLen = min(max(historyLens[slot], 0), historyCapacity);
+    auto const writeLen = min(acceptedLen, max(historyCapacity - oldLen, 0));
+    auto const rowAccepted = acceptedTokens + static_cast(row) * acceptedStride;
+    auto rowHistory = historyTokens + static_cast(slot) * historyCapacity;
+
+    for (auto idx = static_cast(threadIdx.x); idx < writeLen; idx += static_cast(blockDim.x))
+    {
+        rowHistory[oldLen + idx] = rowAccepted[idx];
+    }
+
+    // Every thread derived oldLen from historyLens[slot]; make sure all of those loads have happened
+    // before thread 0 publishes the new length. The early returns above depend only on per-row values,
+    // so the whole block reaches this barrier together.
+    __syncthreads();
+
+    if (threadIdx.x == 0)
+    {
+        historyLens[slot] = oldLen + writeLen;
+    }
+}
+
+void invokeAppendSpeculativeAcceptedTokens(int32_t* historyTokens, int32_t* historyLens, int32_t const* seqSlots,
+    int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t numRows, int32_t acceptedStride,
+    int32_t historyCapacity, cudaStream_t stream)
+{
+    if (numRows == 0 || acceptedStride == 0 || historyCapacity == 0)
+    {
+        return;
+    }
+
+    dim3 const grid(numRows);
+    dim3 const block(std::min(acceptedStride, 256));
+    appendSpeculativeAcceptedTokensKernel<<>>(
+        historyTokens, historyLens, seqSlots, acceptedTokens, acceptedLens, acceptedStride, historyCapacity);
+}
+
+template
+__global__ void applySpeculativeCountFrequencyPenaltyKernel(T* logits, int32_t const* tokenCounts,
+    int32_t const* rowSlots, float const* frequencyPenalties, int32_t vocabSize)
+{
+    
auto const row = static_cast(blockIdx.x); + auto const tokenId = static_cast(blockIdx.y) * static_cast(blockDim.x) + + static_cast(threadIdx.x); + if (tokenId >= vocabSize) + { + return; + } + + auto const frequencyPenalty = frequencyPenalties[row]; + if (frequencyPenalty == 0.0f) + { + return; + } + + auto const slot = rowSlots[row]; + if (slot < 0) + { + return; + } + + auto const count = tokenCounts[static_cast(slot) * vocabSize + tokenId]; + if (count <= 0) + { + return; + } + + auto rowLogits = logits + static_cast(row) * vocabSize; + auto logit = static_cast(rowLogits[tokenId]); + logit -= frequencyPenalty * static_cast(count); + rowLogits[tokenId] = static_cast(logit); +} + +template +void invokeApplySpeculativeCountFrequencyPenalty(T* logits, int32_t const* tokenCounts, + int32_t const* rowSlots, float const* frequencyPenalties, int32_t numRows, int32_t vocabSize, cudaStream_t stream) +{ + if (numRows == 0 || vocabSize == 0) + { + return; + } + + dim3 const block(256); + dim3 const grid(numRows, (vocabSize + static_cast(block.x) - 1) / static_cast(block.x)); + applySpeculativeCountFrequencyPenaltyKernel<<>>( + logits, tokenCounts, rowSlots, frequencyPenalties, vocabSize); +} + +template void invokeApplySpeculativeCountFrequencyPenalty( + float*, int32_t const*, int32_t const*, float const*, int32_t, int32_t, cudaStream_t); +template void invokeApplySpeculativeCountFrequencyPenalty( + half*, int32_t const*, int32_t const*, float const*, int32_t, int32_t, cudaStream_t); +template void invokeApplySpeculativeCountFrequencyPenalty<__nv_bfloat16>( + __nv_bfloat16*, int32_t const*, int32_t const*, float const*, int32_t, int32_t, cudaStream_t); + +__global__ void appendSpeculativeAcceptedTokenCountsKernel(int32_t* tokenCounts, int32_t const* seqSlots, + int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t acceptedStride, int32_t vocabSize) +{ + auto const row = static_cast(blockIdx.x); + auto const slot = seqSlots[row]; + if (slot < 0) + { + return; 
+ } + + auto const acceptedLen = min(max(acceptedLens[row], 0), acceptedStride); + auto const rowAccepted = acceptedTokens + static_cast(row) * acceptedStride; + auto rowCounts = tokenCounts + static_cast(slot) * vocabSize; + + for (auto idx = static_cast(threadIdx.x); idx < acceptedLen; idx += static_cast(blockDim.x)) + { + auto const tokenId = rowAccepted[idx]; + if (tokenId >= 0 && tokenId < vocabSize) + { + atomicAdd(rowCounts + tokenId, 1); + } + } +} + +void invokeAppendSpeculativeAcceptedTokenCounts(int32_t* tokenCounts, int32_t const* seqSlots, + int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t numRows, int32_t acceptedStride, + int32_t vocabSize, cudaStream_t stream) +{ + if (numRows == 0 || acceptedStride == 0 || vocabSize == 0) + { + return; + } + + dim3 const grid(numRows); + dim3 const block(std::min(acceptedStride, 256)); + appendSpeculativeAcceptedTokenCountsKernel<<>>( + tokenCounts, seqSlots, acceptedTokens, acceptedLens, acceptedStride, vocabSize); +} + +template +__global__ void applySpeculativeSparseCountFrequencyPenaltyKernel(T* logits, int32_t const* tokenIds, + int32_t const* tokenCounts, int32_t const* countLens, int32_t const* rowSlots, float const* frequencyPenalties, + int32_t numRows, int32_t countCapacity, int32_t vocabSize) +{ + auto const row = static_cast(blockIdx.x); + auto const frequencyPenalty = frequencyPenalties[row]; + if (frequencyPenalty == 0.0f) + { + return; + } + + auto const slot = rowSlots[row]; + if (slot < 0) + { + return; + } + + if (row > 0 && rowSlots[row - 1] == slot && frequencyPenalties[row - 1] == frequencyPenalty) + { + return; + } + + auto rowEnd = row + 1; + while (rowEnd < numRows && rowSlots[rowEnd] == slot && frequencyPenalties[rowEnd] == frequencyPenalty) + { + ++rowEnd; + } + + auto const countLen = min(max(countLens[slot], 0), countCapacity); + auto const rowTokenIds = tokenIds + static_cast(slot) * countCapacity; + auto const rowTokenCounts = tokenCounts + static_cast(slot) * 
countCapacity; + + for (auto idx = static_cast(threadIdx.x); idx < countLen; idx += static_cast(blockDim.x)) + { + auto const tokenId = rowTokenIds[idx]; + auto const count = rowTokenCounts[idx]; + if (count > 0 && tokenId >= 0 && tokenId < vocabSize) + { + for (auto applyRow = row; applyRow < rowEnd; ++applyRow) + { + auto rowLogits = logits + static_cast(applyRow) * vocabSize; + auto logit = static_cast(rowLogits[tokenId]); + logit -= frequencyPenalty * static_cast(count); + rowLogits[tokenId] = static_cast(logit); + } + } + } +} + +template +void invokeApplySpeculativeSparseCountFrequencyPenalty(T* logits, int32_t const* tokenIds, + int32_t const* tokenCounts, int32_t const* countLens, int32_t const* rowSlots, float const* frequencyPenalties, + int32_t numRows, int32_t countCapacity, int32_t vocabSize, cudaStream_t stream) +{ + if (numRows == 0 || countCapacity == 0 || vocabSize == 0) + { + return; + } + + dim3 const grid(numRows); + dim3 const block(256); + applySpeculativeSparseCountFrequencyPenaltyKernel<<>>(logits, tokenIds, tokenCounts, + countLens, rowSlots, frequencyPenalties, numRows, countCapacity, vocabSize); +} + +template void invokeApplySpeculativeSparseCountFrequencyPenalty(float*, int32_t const*, int32_t const*, + int32_t const*, int32_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t); +template void invokeApplySpeculativeSparseCountFrequencyPenalty(half*, int32_t const*, int32_t const*, + int32_t const*, int32_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t); +template void invokeApplySpeculativeSparseCountFrequencyPenalty<__nv_bfloat16>(__nv_bfloat16*, int32_t const*, + int32_t const*, int32_t const*, int32_t const*, float const*, int32_t, int32_t, int32_t, cudaStream_t); + +__global__ void appendSpeculativeSparseTokenCountsKernel(int32_t* tokenIds, int32_t* tokenCounts, int32_t* countLens, + int32_t const* seqSlots, int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t acceptedStride, + int32_t 
countCapacity, int32_t vocabSize)
+{
+    // Folds the accepted tokens of one row into the sparse (tokenId, count) table of the row's slot.
+    //
+    // Launch layout: one block per row. Accepted tokens are processed one at a time: all threads
+    // search the slot's table for the token in parallel, then thread 0 either bumps the matching
+    // count or appends a new entry (silently dropped once countCapacity entries are in use).
+    // Rows with a negative slot are skipped; token ids outside [0, vocabSize) are ignored.
+    auto const row = static_cast(blockIdx.x);
+    auto const slot = seqSlots[row];
+    if (slot < 0)
+    {
+        return;
+    }
+
+    auto len = min(max(countLens[slot], 0), countCapacity);
+    auto rowTokenIds = tokenIds + static_cast(slot) * countCapacity;
+    auto rowTokenCounts = tokenCounts + static_cast(slot) * countCapacity;
+    auto const rowAccepted = acceptedTokens + static_cast(row) * acceptedStride;
+    auto const acceptedLen = min(max(acceptedLens[row], 0), acceptedStride);
+
+    __shared__ int32_t lenShared;     // current number of valid table entries for this slot
+    __shared__ int32_t tokenIdShared; // token being processed in the current iteration
+    __shared__ int32_t foundIdx;      // table index of tokenIdShared, or -1 if absent
+
+    if (threadIdx.x == 0)
+    {
+        lenShared = len;
+    }
+    __syncthreads();
+
+    for (auto acceptedIdx = 0; acceptedIdx < acceptedLen; ++acceptedIdx)
+    {
+        if (threadIdx.x == 0)
+        {
+            tokenIdShared = rowAccepted[acceptedIdx];
+            foundIdx = -1;
+        }
+        __syncthreads();
+
+        // tokenId comes from shared memory, so the branch below is uniform across the block and the
+        // barrier inside it is reached by either all threads or none.
+        auto const tokenId = tokenIdShared;
+        if (tokenId >= 0 && tokenId < vocabSize)
+        {
+            auto const currentLen = lenShared;
+            for (auto idx = static_cast(threadIdx.x); idx < currentLen;
+                 idx += static_cast(blockDim.x))
+            {
+                if (rowTokenIds[idx] == tokenId)
+                {
+                    atomicCAS(&foundIdx, -1, idx);
+                }
+            }
+            __syncthreads();
+
+            if (threadIdx.x == 0)
+            {
+                if (foundIdx >= 0)
+                {
+                    rowTokenCounts[foundIdx] += 1;
+                }
+                else if (lenShared < countCapacity)
+                {
+                    rowTokenIds[lenShared] = tokenId;
+                    rowTokenCounts[lenShared] = 1;
+                    ++lenShared;
+                }
+            }
+        }
+
+        // End-of-iteration barrier, executed for valid AND skipped tokens: thread 0 must not
+        // overwrite tokenIdShared / foundIdx for the next token until every thread has consumed the
+        // current values. Previously this barrier lived inside the branch above, so an out-of-range
+        // token let thread 0 race ahead and desynchronise the block's barrier sequence.
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0)
+    {
+        countLens[slot] = lenShared;
+    }
+}
+
+void invokeAppendSpeculativeSparseTokenCounts(int32_t* tokenIds, int32_t* tokenCounts, int32_t* countLens,
+    int32_t const* seqSlots, int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t numRows,
+    int32_t acceptedStride, int32_t countCapacity, int32_t vocabSize, cudaStream_t stream)
+{
+    if (numRows == 0 || acceptedStride == 0 || countCapacity == 0 || vocabSize == 0)
+    {
+        return;
+    }
+
+    dim3 const grid(numRows);
+    dim3 const block(std::min(countCapacity, 256));
+    
appendSpeculativeSparseTokenCountsKernel<<>>(tokenIds, tokenCounts, countLens, seqSlots, + acceptedTokens, acceptedLens, acceptedStride, countCapacity, vocabSize); +} + +__global__ void initSpeculativeSparseTokenCountsKernel(int32_t* tokenIds, int32_t* tokenCounts, int32_t* countLens, + int32_t const* promptTokenIds, int32_t const* promptTokenCounts, int32_t const* promptLens, + int32_t const* seqSlots, int32_t promptCapacity, int32_t countCapacity, int32_t vocabSize) +{ + auto const row = static_cast(blockIdx.x); + auto const slot = seqSlots[row]; + if (slot < 0) + { + return; + } + + auto const len = min(min(max(promptLens[row], 0), promptCapacity), countCapacity); + auto rowTokenIds = tokenIds + static_cast(slot) * countCapacity; + auto rowTokenCounts = tokenCounts + static_cast(slot) * countCapacity; + auto const rowPromptTokenIds = promptTokenIds + static_cast(row) * promptCapacity; + auto const rowPromptTokenCounts = promptTokenCounts + static_cast(row) * promptCapacity; + + for (auto idx = static_cast(threadIdx.x); idx < len; idx += static_cast(blockDim.x)) + { + auto const tokenId = rowPromptTokenIds[idx]; + auto const count = rowPromptTokenCounts[idx]; + if (tokenId >= 0 && tokenId < vocabSize && count > 0) + { + rowTokenIds[idx] = tokenId; + rowTokenCounts[idx] = count; + } + else + { + rowTokenIds[idx] = 0; + rowTokenCounts[idx] = 0; + } + } + + if (threadIdx.x == 0) + { + countLens[slot] = len; + } +} + +void invokeInitSpeculativeSparseTokenCounts(int32_t* tokenIds, int32_t* tokenCounts, int32_t* countLens, + int32_t const* promptTokenIds, int32_t const* promptTokenCounts, int32_t const* promptLens, + int32_t const* seqSlots, int32_t numRows, int32_t promptCapacity, int32_t countCapacity, int32_t vocabSize, + cudaStream_t stream) +{ + if (numRows == 0 || promptCapacity == 0 || countCapacity == 0 || vocabSize == 0) + { + return; + } + + dim3 const grid(numRows); + dim3 const block(std::min(promptCapacity, 256)); + 
initSpeculativeSparseTokenCountsKernel<<>>(tokenIds, tokenCounts, countLens, promptTokenIds, + promptTokenCounts, promptLens, seqSlots, promptCapacity, countCapacity, vocabSize); +} + +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.h new file mode 100644 index 00000000000..b9e7bd599d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.h @@ -0,0 +1,56 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "tensorrt_llm/common/config.h" +#include "tensorrt_llm/common/cudaDriverWrapper.h" +#include "tensorrt_llm/common/cudaUtils.h" + +#include +#include + +TRTLLM_NAMESPACE_BEGIN + +namespace kernels +{ + +template +void invokeApplySpeculativeTokenPenalties(T* logits, TokenT const* tokenIds, float const* penaltyValues, int32_t numRows, + int32_t width, int32_t vocabSize, cudaStream_t stream); + +void invokeApplySpeculativeHistoryFrequencyPenalty(float* logits, int32_t const* historyTokens, + int32_t const* historyLens, int32_t const* rowSlots, float const* frequencyPenalties, int32_t numRows, + int32_t historyCapacity, int32_t vocabSize, cudaStream_t stream); + +void invokeAppendSpeculativeAcceptedTokens(int32_t* historyTokens, int32_t* historyLens, int32_t const* seqSlots, + int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t numRows, int32_t acceptedStride, + int32_t historyCapacity, cudaStream_t stream); + +template +void invokeApplySpeculativeCountFrequencyPenalty(T* logits, int32_t const* tokenCounts, + int32_t const* rowSlots, float const* frequencyPenalties, int32_t numRows, int32_t vocabSize, cudaStream_t stream); + +void invokeAppendSpeculativeAcceptedTokenCounts(int32_t* tokenCounts, int32_t const* seqSlots, + int32_t const* 
acceptedTokens, int32_t const* acceptedLens, int32_t numRows, int32_t acceptedStride, + int32_t vocabSize, cudaStream_t stream); + +template +void invokeApplySpeculativeSparseCountFrequencyPenalty(T* logits, int32_t const* tokenIds, + int32_t const* tokenCounts, int32_t const* countLens, int32_t const* rowSlots, float const* frequencyPenalties, + int32_t numRows, int32_t countCapacity, int32_t vocabSize, cudaStream_t stream); + +void invokeAppendSpeculativeSparseTokenCounts(int32_t* tokenIds, int32_t* tokenCounts, int32_t* countLens, + int32_t const* seqSlots, int32_t const* acceptedTokens, int32_t const* acceptedLens, int32_t numRows, + int32_t acceptedStride, int32_t countCapacity, int32_t vocabSize, cudaStream_t stream); + +void invokeInitSpeculativeSparseTokenCounts(int32_t* tokenIds, int32_t* tokenCounts, int32_t* countLens, + int32_t const* promptTokenIds, int32_t const* promptTokenCounts, int32_t const* promptLens, + int32_t const* seqSlots, int32_t numRows, int32_t promptCapacity, int32_t countCapacity, int32_t vocabSize, + cudaStream_t stream); + +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/CMakeLists.txt b/cpp/tensorrt_llm/thop/CMakeLists.txt index 0d077c4b030..16fdccf207e 100644 --- a/cpp/tensorrt_llm/thop/CMakeLists.txt +++ b/cpp/tensorrt_llm/thop/CMakeLists.txt @@ -108,6 +108,7 @@ add_library( weightOnlyQuantGemm.cpp weightOnlyQuantOp.cpp specDecOp.cpp + speculativeLogitsPenaltyOp.cpp loraOp.cpp finegrained_mixed_dtype_gemm_thop.cpp tinygemm2.cpp diff --git a/cpp/tensorrt_llm/thop/speculativeLogitsPenaltyOp.cpp b/cpp/tensorrt_llm/thop/speculativeLogitsPenaltyOp.cpp new file mode 100644 index 00000000000..082748f3cc8 --- /dev/null +++ b/cpp/tensorrt_llm/thop/speculativeLogitsPenaltyOp.cpp @@ -0,0 +1,473 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. 
SPDX-License-Identifier: Apache-2.0 + */ + +#include "tensorrt_llm/kernels/speculativeDecoding/logitsPenaltyKernels.h" +#include "tensorrt_llm/thop/thUtils.h" + +#include +#include + +namespace th = torch; +namespace tk = tensorrt_llm::kernels; + +TRTLLM_NAMESPACE_BEGIN + +namespace torch_ext +{ + +namespace +{ + +template +void invokeTypedApplyTokenPenalties(th::Tensor& logits, th::Tensor const& tokenIds, th::Tensor const& penaltyValues) +{ + auto stream = at::cuda::getCurrentCUDAStream(logits.get_device()).stream(); + tk::invokeApplySpeculativeTokenPenalties(reinterpret_cast(logits.data_ptr()), + reinterpret_cast(tokenIds.data_ptr()), reinterpret_cast(penaltyValues.data_ptr()), + static_cast(logits.size(0)), static_cast(tokenIds.size(1)), + static_cast(logits.size(1)), stream); +} + +template +void dispatchTokenType(th::Tensor& logits, th::Tensor const& tokenIds, th::Tensor const& penaltyValues) +{ + switch (tokenIds.scalar_type()) + { + case torch::kInt: + invokeTypedApplyTokenPenalties(logits, tokenIds, penaltyValues); + break; + case torch::kLong: + invokeTypedApplyTokenPenalties(logits, tokenIds, penaltyValues); + break; + default: + TORCH_CHECK(false, "token_ids dtype must be int32 or int64."); + } +} + +template +void invokeTypedApplyCountFrequencyPenalty( + th::Tensor& logits, th::Tensor const& tokenCounts, th::Tensor const& rowSlots, th::Tensor const& frequencyPenalties) +{ + auto stream = at::cuda::getCurrentCUDAStream(logits.get_device()).stream(); + tk::invokeApplySpeculativeCountFrequencyPenalty(reinterpret_cast(logits.data_ptr()), + reinterpret_cast(tokenCounts.data_ptr()), reinterpret_cast(rowSlots.data_ptr()), + reinterpret_cast(frequencyPenalties.data_ptr()), static_cast(logits.size(0)), + static_cast(logits.size(1)), stream); +} + +template +void invokeTypedApplySparseCountFrequencyPenalty(th::Tensor& logits, th::Tensor const& tokenIds, + th::Tensor const& tokenCounts, th::Tensor const& countLens, th::Tensor const& rowSlots, + th::Tensor const& 
frequencyPenalties) +{ + auto stream = at::cuda::getCurrentCUDAStream(logits.get_device()).stream(); + tk::invokeApplySpeculativeSparseCountFrequencyPenalty(reinterpret_cast(logits.data_ptr()), + reinterpret_cast(tokenIds.data_ptr()), reinterpret_cast(tokenCounts.data_ptr()), + reinterpret_cast(countLens.data_ptr()), reinterpret_cast(rowSlots.data_ptr()), + reinterpret_cast(frequencyPenalties.data_ptr()), static_cast(logits.size(0)), + static_cast(tokenIds.size(1)), static_cast(logits.size(1)), stream); +} + +} // namespace + +void speculativeApplyTokenPenalties(th::Tensor& logits, th::Tensor const& tokenIds, th::Tensor const& penaltyValues) +{ + TORCH_CHECK(logits.is_cuda(), "logits must be a CUDA tensor."); + TORCH_CHECK(logits.is_contiguous(), "logits must be contiguous."); + TORCH_CHECK(logits.dim() == 2, "logits must be a 2D tensor."); + TORCH_CHECK(tokenIds.is_cuda(), "token_ids must be a CUDA tensor."); + TORCH_CHECK(tokenIds.is_contiguous(), "token_ids must be contiguous."); + TORCH_CHECK(tokenIds.dim() == 2, "token_ids must be a 2D tensor."); + TORCH_CHECK(penaltyValues.is_cuda(), "penalty_values must be a CUDA tensor."); + TORCH_CHECK(penaltyValues.is_contiguous(), "penalty_values must be contiguous."); + TORCH_CHECK(penaltyValues.dim() == 2, "penalty_values must be a 2D tensor."); + TORCH_CHECK(penaltyValues.scalar_type() == torch::kFloat, "penalty_values dtype must be float32."); + TORCH_CHECK(tokenIds.size(0) == logits.size(0), "token_ids and logits must have the same row count."); + TORCH_CHECK(penaltyValues.size(0) == logits.size(0), "penalty_values and logits must have the same row count."); + TORCH_CHECK(penaltyValues.size(1) == tokenIds.size(1), "penalty_values and token_ids widths must match."); + + if (logits.size(0) == 0 || tokenIds.size(1) == 0) + { + return; + } + + switch (logits.scalar_type()) + { + case torch::kFloat32: + dispatchTokenType(logits, tokenIds, penaltyValues); + break; + case torch::kFloat16: + dispatchTokenType(logits, 
tokenIds, penaltyValues); + break; + case torch::kBFloat16: + dispatchTokenType<__nv_bfloat16>(logits, tokenIds, penaltyValues); + break; + default: + TORCH_CHECK(false, "logits dtype must be float32, float16, or bfloat16."); + } +} + +void speculativeApplyHistoryFrequencyPenalty(th::Tensor& logits, th::Tensor const& historyTokens, + th::Tensor const& historyLens, th::Tensor const& rowSlots, th::Tensor const& frequencyPenalties) +{ + TORCH_CHECK(logits.is_cuda(), "logits must be a CUDA tensor."); + TORCH_CHECK(logits.is_contiguous(), "logits must be contiguous."); + TORCH_CHECK(logits.dim() == 2, "logits must be a 2D tensor."); + TORCH_CHECK(logits.scalar_type() == torch::kFloat32, "logits dtype must be float32."); + TORCH_CHECK(historyTokens.is_cuda(), "history_tokens must be a CUDA tensor."); + TORCH_CHECK(historyTokens.is_contiguous(), "history_tokens must be contiguous."); + TORCH_CHECK(historyTokens.dim() == 2, "history_tokens must be a 2D tensor."); + TORCH_CHECK(historyTokens.scalar_type() == torch::kInt, "history_tokens dtype must be int32."); + TORCH_CHECK(historyLens.is_cuda(), "history_lens must be a CUDA tensor."); + TORCH_CHECK(historyLens.is_contiguous(), "history_lens must be contiguous."); + TORCH_CHECK(historyLens.dim() == 1, "history_lens must be a 1D tensor."); + TORCH_CHECK(historyLens.scalar_type() == torch::kInt, "history_lens dtype must be int32."); + TORCH_CHECK(rowSlots.is_cuda(), "row_slots must be a CUDA tensor."); + TORCH_CHECK(rowSlots.is_contiguous(), "row_slots must be contiguous."); + TORCH_CHECK(rowSlots.dim() == 1, "row_slots must be a 1D tensor."); + TORCH_CHECK(rowSlots.scalar_type() == torch::kInt, "row_slots dtype must be int32."); + TORCH_CHECK(frequencyPenalties.is_cuda(), "frequency_penalties must be a CUDA tensor."); + TORCH_CHECK(frequencyPenalties.is_contiguous(), "frequency_penalties must be contiguous."); + TORCH_CHECK(frequencyPenalties.dim() == 1, "frequency_penalties must be a 1D tensor."); + 
TORCH_CHECK(frequencyPenalties.scalar_type() == torch::kFloat, "frequency_penalties dtype must be float32."); + TORCH_CHECK(rowSlots.size(0) == logits.size(0), "row_slots and logits must have the same row count."); + TORCH_CHECK( + frequencyPenalties.size(0) == logits.size(0), "frequency_penalties and logits must have the same row count."); + TORCH_CHECK(historyLens.size(0) == historyTokens.size(0), "history_lens and history_tokens slot count mismatch."); + + if (logits.size(0) == 0 || historyTokens.size(1) == 0) + { + return; + } + + auto stream = at::cuda::getCurrentCUDAStream(logits.get_device()).stream(); + tk::invokeApplySpeculativeHistoryFrequencyPenalty(reinterpret_cast(logits.data_ptr()), + reinterpret_cast(historyTokens.data_ptr()), reinterpret_cast(historyLens.data_ptr()), + reinterpret_cast(rowSlots.data_ptr()), + reinterpret_cast(frequencyPenalties.data_ptr()), static_cast(logits.size(0)), + static_cast(historyTokens.size(1)), static_cast(logits.size(1)), stream); +} + +void speculativeAppendAcceptedTokens(th::Tensor& historyTokens, th::Tensor& historyLens, th::Tensor const& seqSlots, + th::Tensor const& acceptedTokens, th::Tensor const& acceptedLens) +{ + TORCH_CHECK(historyTokens.is_cuda(), "history_tokens must be a CUDA tensor."); + TORCH_CHECK(historyTokens.is_contiguous(), "history_tokens must be contiguous."); + TORCH_CHECK(historyTokens.dim() == 2, "history_tokens must be a 2D tensor."); + TORCH_CHECK(historyTokens.scalar_type() == torch::kInt, "history_tokens dtype must be int32."); + TORCH_CHECK(historyLens.is_cuda(), "history_lens must be a CUDA tensor."); + TORCH_CHECK(historyLens.is_contiguous(), "history_lens must be contiguous."); + TORCH_CHECK(historyLens.dim() == 1, "history_lens must be a 1D tensor."); + TORCH_CHECK(historyLens.scalar_type() == torch::kInt, "history_lens dtype must be int32."); + TORCH_CHECK(seqSlots.is_cuda(), "seq_slots must be a CUDA tensor."); + TORCH_CHECK(seqSlots.is_contiguous(), "seq_slots must be contiguous."); 
+ TORCH_CHECK(seqSlots.dim() == 1, "seq_slots must be a 1D tensor."); + TORCH_CHECK(seqSlots.scalar_type() == torch::kInt, "seq_slots dtype must be int32."); + TORCH_CHECK(acceptedTokens.is_cuda(), "accepted_tokens must be a CUDA tensor."); + TORCH_CHECK(acceptedTokens.is_contiguous(), "accepted_tokens must be contiguous."); + TORCH_CHECK(acceptedTokens.dim() == 2, "accepted_tokens must be a 2D tensor."); + TORCH_CHECK(acceptedTokens.scalar_type() == torch::kInt, "accepted_tokens dtype must be int32."); + TORCH_CHECK(acceptedLens.is_cuda(), "accepted_lens must be a CUDA tensor."); + TORCH_CHECK(acceptedLens.is_contiguous(), "accepted_lens must be contiguous."); + TORCH_CHECK(acceptedLens.dim() == 1, "accepted_lens must be a 1D tensor."); + TORCH_CHECK(acceptedLens.scalar_type() == torch::kInt, "accepted_lens dtype must be int32."); + TORCH_CHECK(seqSlots.size(0) == acceptedTokens.size(0), "seq_slots and accepted_tokens row count mismatch."); + TORCH_CHECK(acceptedLens.size(0) == acceptedTokens.size(0), "accepted_lens and accepted_tokens row count mismatch."); + TORCH_CHECK(historyLens.size(0) == historyTokens.size(0), "history_lens and history_tokens slot count mismatch."); + + if (acceptedTokens.size(0) == 0 || acceptedTokens.size(1) == 0 || historyTokens.size(1) == 0) + { + return; + } + + auto stream = at::cuda::getCurrentCUDAStream(historyTokens.get_device()).stream(); + tk::invokeAppendSpeculativeAcceptedTokens(reinterpret_cast(historyTokens.data_ptr()), + reinterpret_cast(historyLens.data_ptr()), reinterpret_cast(seqSlots.data_ptr()), + reinterpret_cast(acceptedTokens.data_ptr()), reinterpret_cast(acceptedLens.data_ptr()), + static_cast(acceptedTokens.size(0)), static_cast(acceptedTokens.size(1)), + static_cast(historyTokens.size(1)), stream); +} + +void speculativeApplyCountFrequencyPenalty(th::Tensor& logits, th::Tensor const& tokenCounts, + th::Tensor const& rowSlots, th::Tensor const& frequencyPenalties) +{ + TORCH_CHECK(logits.is_cuda(), "logits must be 
a CUDA tensor."); + TORCH_CHECK(logits.is_contiguous(), "logits must be contiguous."); + TORCH_CHECK(logits.dim() == 2, "logits must be a 2D tensor."); + TORCH_CHECK(tokenCounts.is_cuda(), "token_counts must be a CUDA tensor."); + TORCH_CHECK(tokenCounts.is_contiguous(), "token_counts must be contiguous."); + TORCH_CHECK(tokenCounts.dim() == 2, "token_counts must be a 2D tensor."); + TORCH_CHECK(tokenCounts.scalar_type() == torch::kInt, "token_counts dtype must be int32."); + TORCH_CHECK(rowSlots.is_cuda(), "row_slots must be a CUDA tensor."); + TORCH_CHECK(rowSlots.is_contiguous(), "row_slots must be contiguous."); + TORCH_CHECK(rowSlots.dim() == 1, "row_slots must be a 1D tensor."); + TORCH_CHECK(rowSlots.scalar_type() == torch::kInt, "row_slots dtype must be int32."); + TORCH_CHECK(frequencyPenalties.is_cuda(), "frequency_penalties must be a CUDA tensor."); + TORCH_CHECK(frequencyPenalties.is_contiguous(), "frequency_penalties must be contiguous."); + TORCH_CHECK(frequencyPenalties.dim() == 1, "frequency_penalties must be a 1D tensor."); + TORCH_CHECK(frequencyPenalties.scalar_type() == torch::kFloat, "frequency_penalties dtype must be float32."); + TORCH_CHECK(rowSlots.size(0) == logits.size(0), "row_slots and logits must have the same row count."); + TORCH_CHECK( + frequencyPenalties.size(0) == logits.size(0), "frequency_penalties and logits must have the same row count."); + TORCH_CHECK(tokenCounts.size(1) == logits.size(1), "token_counts and logits vocab size mismatch."); + + if (logits.size(0) == 0 || logits.size(1) == 0) + { + return; + } + + switch (logits.scalar_type()) + { + case torch::kFloat32: + invokeTypedApplyCountFrequencyPenalty(logits, tokenCounts, rowSlots, frequencyPenalties); + break; + case torch::kFloat16: + invokeTypedApplyCountFrequencyPenalty(logits, tokenCounts, rowSlots, frequencyPenalties); + break; + case torch::kBFloat16: + invokeTypedApplyCountFrequencyPenalty<__nv_bfloat16>(logits, tokenCounts, rowSlots, frequencyPenalties); + 
break; + default: + TORCH_CHECK(false, "logits dtype must be float32, float16, or bfloat16."); + } +} + +void speculativeAppendAcceptedTokenCounts(th::Tensor& tokenCounts, th::Tensor const& seqSlots, + th::Tensor const& acceptedTokens, th::Tensor const& acceptedLens) +{ + TORCH_CHECK(tokenCounts.is_cuda(), "token_counts must be a CUDA tensor."); + TORCH_CHECK(tokenCounts.is_contiguous(), "token_counts must be contiguous."); + TORCH_CHECK(tokenCounts.dim() == 2, "token_counts must be a 2D tensor."); + TORCH_CHECK(tokenCounts.scalar_type() == torch::kInt, "token_counts dtype must be int32."); + TORCH_CHECK(seqSlots.is_cuda(), "seq_slots must be a CUDA tensor."); + TORCH_CHECK(seqSlots.is_contiguous(), "seq_slots must be contiguous."); + TORCH_CHECK(seqSlots.dim() == 1, "seq_slots must be a 1D tensor."); + TORCH_CHECK(seqSlots.scalar_type() == torch::kInt, "seq_slots dtype must be int32."); + TORCH_CHECK(acceptedTokens.is_cuda(), "accepted_tokens must be a CUDA tensor."); + TORCH_CHECK(acceptedTokens.is_contiguous(), "accepted_tokens must be contiguous."); + TORCH_CHECK(acceptedTokens.dim() == 2, "accepted_tokens must be a 2D tensor."); + TORCH_CHECK(acceptedTokens.scalar_type() == torch::kInt, "accepted_tokens dtype must be int32."); + TORCH_CHECK(acceptedLens.is_cuda(), "accepted_lens must be a CUDA tensor."); + TORCH_CHECK(acceptedLens.is_contiguous(), "accepted_lens must be contiguous."); + TORCH_CHECK(acceptedLens.dim() == 1, "accepted_lens must be a 1D tensor."); + TORCH_CHECK(acceptedLens.scalar_type() == torch::kInt, "accepted_lens dtype must be int32."); + TORCH_CHECK(seqSlots.size(0) == acceptedTokens.size(0), "seq_slots and accepted_tokens row count mismatch."); + TORCH_CHECK(acceptedLens.size(0) == acceptedTokens.size(0), "accepted_lens and accepted_tokens row count mismatch."); + + if (acceptedTokens.size(0) == 0 || acceptedTokens.size(1) == 0 || tokenCounts.size(1) == 0) + { + return; + } + + auto stream = 
at::cuda::getCurrentCUDAStream(tokenCounts.get_device()).stream(); + tk::invokeAppendSpeculativeAcceptedTokenCounts(reinterpret_cast(tokenCounts.data_ptr()), + reinterpret_cast(seqSlots.data_ptr()), reinterpret_cast(acceptedTokens.data_ptr()), + reinterpret_cast(acceptedLens.data_ptr()), static_cast(acceptedTokens.size(0)), + static_cast(acceptedTokens.size(1)), static_cast(tokenCounts.size(1)), stream); +} + +void speculativeApplySparseCountFrequencyPenalty(th::Tensor& logits, th::Tensor const& tokenIds, + th::Tensor const& tokenCounts, th::Tensor const& countLens, th::Tensor const& rowSlots, + th::Tensor const& frequencyPenalties) +{ + TORCH_CHECK(logits.is_cuda(), "logits must be a CUDA tensor."); + TORCH_CHECK(logits.is_contiguous(), "logits must be contiguous."); + TORCH_CHECK(logits.dim() == 2, "logits must be a 2D tensor."); + TORCH_CHECK(tokenIds.is_cuda(), "token_ids must be a CUDA tensor."); + TORCH_CHECK(tokenIds.is_contiguous(), "token_ids must be contiguous."); + TORCH_CHECK(tokenIds.dim() == 2, "token_ids must be a 2D tensor."); + TORCH_CHECK(tokenIds.scalar_type() == torch::kInt, "token_ids dtype must be int32."); + TORCH_CHECK(tokenCounts.is_cuda(), "token_counts must be a CUDA tensor."); + TORCH_CHECK(tokenCounts.is_contiguous(), "token_counts must be contiguous."); + TORCH_CHECK(tokenCounts.dim() == 2, "token_counts must be a 2D tensor."); + TORCH_CHECK(tokenCounts.scalar_type() == torch::kInt, "token_counts dtype must be int32."); + TORCH_CHECK(countLens.is_cuda(), "count_lens must be a CUDA tensor."); + TORCH_CHECK(countLens.is_contiguous(), "count_lens must be contiguous."); + TORCH_CHECK(countLens.dim() == 1, "count_lens must be a 1D tensor."); + TORCH_CHECK(countLens.scalar_type() == torch::kInt, "count_lens dtype must be int32."); + TORCH_CHECK(rowSlots.is_cuda(), "row_slots must be a CUDA tensor."); + TORCH_CHECK(rowSlots.is_contiguous(), "row_slots must be contiguous."); + TORCH_CHECK(rowSlots.dim() == 1, "row_slots must be a 1D tensor."); 
+ TORCH_CHECK(rowSlots.scalar_type() == torch::kInt, "row_slots dtype must be int32."); + TORCH_CHECK(frequencyPenalties.is_cuda(), "frequency_penalties must be a CUDA tensor."); + TORCH_CHECK(frequencyPenalties.is_contiguous(), "frequency_penalties must be contiguous."); + TORCH_CHECK(frequencyPenalties.dim() == 1, "frequency_penalties must be a 1D tensor."); + TORCH_CHECK(frequencyPenalties.scalar_type() == torch::kFloat, "frequency_penalties dtype must be float32."); + TORCH_CHECK(tokenIds.size(0) == tokenCounts.size(0), "token_ids and token_counts slot count mismatch."); + TORCH_CHECK(tokenIds.size(1) == tokenCounts.size(1), "token_ids and token_counts capacity mismatch."); + TORCH_CHECK(countLens.size(0) == tokenIds.size(0), "count_lens and token_ids slot count mismatch."); + TORCH_CHECK(rowSlots.size(0) == logits.size(0), "row_slots and logits must have the same row count."); + TORCH_CHECK( + frequencyPenalties.size(0) == logits.size(0), "frequency_penalties and logits must have the same row count."); + + if (logits.size(0) == 0 || tokenIds.size(1) == 0 || logits.size(1) == 0) + { + return; + } + + switch (logits.scalar_type()) + { + case torch::kFloat32: + invokeTypedApplySparseCountFrequencyPenalty( + logits, tokenIds, tokenCounts, countLens, rowSlots, frequencyPenalties); + break; + case torch::kFloat16: + invokeTypedApplySparseCountFrequencyPenalty( + logits, tokenIds, tokenCounts, countLens, rowSlots, frequencyPenalties); + break; + case torch::kBFloat16: + invokeTypedApplySparseCountFrequencyPenalty<__nv_bfloat16>( + logits, tokenIds, tokenCounts, countLens, rowSlots, frequencyPenalties); + break; + default: + TORCH_CHECK(false, "logits dtype must be float32, float16, or bfloat16."); + } +} + +void speculativeAppendSparseTokenCounts(th::Tensor& tokenIds, th::Tensor& tokenCounts, th::Tensor& countLens, + th::Tensor const& seqSlots, th::Tensor const& acceptedTokens, th::Tensor const& acceptedLens, int64_t vocabSize) +{ + TORCH_CHECK(tokenIds.is_cuda(), 
"token_ids must be a CUDA tensor."); + TORCH_CHECK(tokenIds.is_contiguous(), "token_ids must be contiguous."); + TORCH_CHECK(tokenIds.dim() == 2, "token_ids must be a 2D tensor."); + TORCH_CHECK(tokenIds.scalar_type() == torch::kInt, "token_ids dtype must be int32."); + TORCH_CHECK(tokenCounts.is_cuda(), "token_counts must be a CUDA tensor."); + TORCH_CHECK(tokenCounts.is_contiguous(), "token_counts must be contiguous."); + TORCH_CHECK(tokenCounts.dim() == 2, "token_counts must be a 2D tensor."); + TORCH_CHECK(tokenCounts.scalar_type() == torch::kInt, "token_counts dtype must be int32."); + TORCH_CHECK(countLens.is_cuda(), "count_lens must be a CUDA tensor."); + TORCH_CHECK(countLens.is_contiguous(), "count_lens must be contiguous."); + TORCH_CHECK(countLens.dim() == 1, "count_lens must be a 1D tensor."); + TORCH_CHECK(countLens.scalar_type() == torch::kInt, "count_lens dtype must be int32."); + TORCH_CHECK(seqSlots.is_cuda(), "seq_slots must be a CUDA tensor."); + TORCH_CHECK(seqSlots.is_contiguous(), "seq_slots must be contiguous."); + TORCH_CHECK(seqSlots.dim() == 1, "seq_slots must be a 1D tensor."); + TORCH_CHECK(seqSlots.scalar_type() == torch::kInt, "seq_slots dtype must be int32."); + TORCH_CHECK(acceptedTokens.is_cuda(), "accepted_tokens must be a CUDA tensor."); + TORCH_CHECK(acceptedTokens.is_contiguous(), "accepted_tokens must be contiguous."); + TORCH_CHECK(acceptedTokens.dim() == 2, "accepted_tokens must be a 2D tensor."); + TORCH_CHECK(acceptedTokens.scalar_type() == torch::kInt, "accepted_tokens dtype must be int32."); + TORCH_CHECK(acceptedLens.is_cuda(), "accepted_lens must be a CUDA tensor."); + TORCH_CHECK(acceptedLens.is_contiguous(), "accepted_lens must be contiguous."); + TORCH_CHECK(acceptedLens.dim() == 1, "accepted_lens must be a 1D tensor."); + TORCH_CHECK(acceptedLens.scalar_type() == torch::kInt, "accepted_lens dtype must be int32."); + TORCH_CHECK(tokenIds.size(0) == tokenCounts.size(0), "token_ids and token_counts slot count 
mismatch."); + TORCH_CHECK(tokenIds.size(1) == tokenCounts.size(1), "token_ids and token_counts capacity mismatch."); + TORCH_CHECK(countLens.size(0) == tokenIds.size(0), "count_lens and token_ids slot count mismatch."); + TORCH_CHECK(seqSlots.size(0) == acceptedTokens.size(0), "seq_slots and accepted_tokens row count mismatch."); + TORCH_CHECK(acceptedLens.size(0) == acceptedTokens.size(0), "accepted_lens and accepted_tokens row count mismatch."); + TORCH_CHECK(vocabSize > 0, "vocab_size must be positive."); + + if (acceptedTokens.size(0) == 0 || acceptedTokens.size(1) == 0 || tokenIds.size(1) == 0) + { + return; + } + + auto stream = at::cuda::getCurrentCUDAStream(tokenIds.get_device()).stream(); + tk::invokeAppendSpeculativeSparseTokenCounts(reinterpret_cast(tokenIds.data_ptr()), + reinterpret_cast(tokenCounts.data_ptr()), reinterpret_cast(countLens.data_ptr()), + reinterpret_cast(seqSlots.data_ptr()), reinterpret_cast(acceptedTokens.data_ptr()), + reinterpret_cast(acceptedLens.data_ptr()), static_cast(acceptedTokens.size(0)), + static_cast(acceptedTokens.size(1)), static_cast(tokenIds.size(1)), + static_cast(vocabSize), stream); +} + +void speculativeInitSparseTokenCounts(th::Tensor& tokenIds, th::Tensor& tokenCounts, th::Tensor& countLens, + th::Tensor const& promptTokenIds, th::Tensor const& promptTokenCounts, th::Tensor const& promptLens, + th::Tensor const& seqSlots, int64_t vocabSize) +{ + TORCH_CHECK(tokenIds.is_cuda(), "token_ids must be a CUDA tensor."); + TORCH_CHECK(tokenIds.is_contiguous(), "token_ids must be contiguous."); + TORCH_CHECK(tokenIds.dim() == 2, "token_ids must be a 2D tensor."); + TORCH_CHECK(tokenIds.scalar_type() == torch::kInt, "token_ids dtype must be int32."); + TORCH_CHECK(tokenCounts.is_cuda(), "token_counts must be a CUDA tensor."); + TORCH_CHECK(tokenCounts.is_contiguous(), "token_counts must be contiguous."); + TORCH_CHECK(tokenCounts.dim() == 2, "token_counts must be a 2D tensor."); + TORCH_CHECK(tokenCounts.scalar_type() == 
torch::kInt, "token_counts dtype must be int32."); + TORCH_CHECK(countLens.is_cuda(), "count_lens must be a CUDA tensor."); + TORCH_CHECK(countLens.is_contiguous(), "count_lens must be contiguous."); + TORCH_CHECK(countLens.dim() == 1, "count_lens must be a 1D tensor."); + TORCH_CHECK(countLens.scalar_type() == torch::kInt, "count_lens dtype must be int32."); + TORCH_CHECK(promptTokenIds.is_cuda(), "prompt_token_ids must be a CUDA tensor."); + TORCH_CHECK(promptTokenIds.is_contiguous(), "prompt_token_ids must be contiguous."); + TORCH_CHECK(promptTokenIds.dim() == 2, "prompt_token_ids must be a 2D tensor."); + TORCH_CHECK(promptTokenIds.scalar_type() == torch::kInt, "prompt_token_ids dtype must be int32."); + TORCH_CHECK(promptTokenCounts.is_cuda(), "prompt_token_counts must be a CUDA tensor."); + TORCH_CHECK(promptTokenCounts.is_contiguous(), "prompt_token_counts must be contiguous."); + TORCH_CHECK(promptTokenCounts.dim() == 2, "prompt_token_counts must be a 2D tensor."); + TORCH_CHECK(promptTokenCounts.scalar_type() == torch::kInt, "prompt_token_counts dtype must be int32."); + TORCH_CHECK(promptLens.is_cuda(), "prompt_lens must be a CUDA tensor."); + TORCH_CHECK(promptLens.is_contiguous(), "prompt_lens must be contiguous."); + TORCH_CHECK(promptLens.dim() == 1, "prompt_lens must be a 1D tensor."); + TORCH_CHECK(promptLens.scalar_type() == torch::kInt, "prompt_lens dtype must be int32."); + TORCH_CHECK(seqSlots.is_cuda(), "seq_slots must be a CUDA tensor."); + TORCH_CHECK(seqSlots.is_contiguous(), "seq_slots must be contiguous."); + TORCH_CHECK(seqSlots.dim() == 1, "seq_slots must be a 1D tensor."); + TORCH_CHECK(seqSlots.scalar_type() == torch::kInt, "seq_slots dtype must be int32."); + TORCH_CHECK(tokenIds.size(0) == tokenCounts.size(0), "token_ids and token_counts slot count mismatch."); + TORCH_CHECK(tokenIds.size(1) == tokenCounts.size(1), "token_ids and token_counts capacity mismatch."); + TORCH_CHECK(countLens.size(0) == tokenIds.size(0), "count_lens and 
token_ids slot count mismatch."); + TORCH_CHECK(promptTokenIds.size(0) == promptTokenCounts.size(0), + "prompt_token_ids and prompt_token_counts row count mismatch."); + TORCH_CHECK(promptTokenIds.size(1) == promptTokenCounts.size(1), + "prompt_token_ids and prompt_token_counts capacity mismatch."); + TORCH_CHECK(promptLens.size(0) == promptTokenIds.size(0), "prompt_lens and prompt_token_ids row count mismatch."); + TORCH_CHECK(seqSlots.size(0) == promptTokenIds.size(0), "seq_slots and prompt_token_ids row count mismatch."); + TORCH_CHECK(vocabSize > 0, "vocab_size must be positive."); + + if (promptTokenIds.size(0) == 0 || promptTokenIds.size(1) == 0 || tokenIds.size(1) == 0) + { + return; + } + + auto stream = at::cuda::getCurrentCUDAStream(tokenIds.get_device()).stream(); + tk::invokeInitSpeculativeSparseTokenCounts(reinterpret_cast(tokenIds.data_ptr()), + reinterpret_cast(tokenCounts.data_ptr()), reinterpret_cast(countLens.data_ptr()), + reinterpret_cast(promptTokenIds.data_ptr()), + reinterpret_cast(promptTokenCounts.data_ptr()), + reinterpret_cast(promptLens.data_ptr()), reinterpret_cast(seqSlots.data_ptr()), + static_cast(promptTokenIds.size(0)), static_cast(promptTokenIds.size(1)), + static_cast(tokenIds.size(1)), static_cast(vocabSize), stream); +} + +} // namespace torch_ext + +TRTLLM_NAMESPACE_END + +TORCH_LIBRARY_FRAGMENT(trtllm, m) +{ + m.def("speculative_apply_token_penalties(Tensor(a!) logits, Tensor token_ids, Tensor penalty_values) -> ()"); + m.def( + "speculative_apply_history_frequency_penalty(Tensor(a!) logits, Tensor history_tokens, Tensor history_lens, Tensor row_slots, Tensor frequency_penalties) -> ()"); + m.def( + "speculative_append_accepted_tokens(Tensor(a!) history_tokens, Tensor(b!) history_lens, Tensor seq_slots, Tensor accepted_tokens, Tensor accepted_lens) -> ()"); + m.def( + "speculative_apply_count_frequency_penalty(Tensor(a!) 
logits, Tensor token_counts, Tensor row_slots, Tensor frequency_penalties) -> ()"); + m.def( + "speculative_append_accepted_token_counts(Tensor(a!) token_counts, Tensor seq_slots, Tensor accepted_tokens, Tensor accepted_lens) -> ()"); + m.def( + "speculative_apply_sparse_count_frequency_penalty(Tensor(a!) logits, Tensor token_ids, Tensor token_counts, Tensor count_lens, Tensor row_slots, Tensor frequency_penalties) -> ()"); + m.def( + "speculative_append_sparse_token_counts(Tensor(a!) token_ids, Tensor(b!) token_counts, Tensor(c!) count_lens, Tensor seq_slots, Tensor accepted_tokens, Tensor accepted_lens, int vocab_size) -> ()"); + m.def( + "speculative_init_sparse_token_counts(Tensor(a!) token_ids, Tensor(b!) token_counts, Tensor(c!) count_lens, Tensor prompt_token_ids, Tensor prompt_token_counts, Tensor prompt_lens, Tensor seq_slots, int vocab_size) -> ()"); +} + +TORCH_LIBRARY_IMPL(trtllm, CUDA, m) +{ + m.impl("speculative_apply_token_penalties", &tensorrt_llm::torch_ext::speculativeApplyTokenPenalties); + m.impl("speculative_apply_history_frequency_penalty", + &tensorrt_llm::torch_ext::speculativeApplyHistoryFrequencyPenalty); + m.impl("speculative_append_accepted_tokens", &tensorrt_llm::torch_ext::speculativeAppendAcceptedTokens); + m.impl("speculative_apply_count_frequency_penalty", + &tensorrt_llm::torch_ext::speculativeApplyCountFrequencyPenalty); + m.impl("speculative_append_accepted_token_counts", + &tensorrt_llm::torch_ext::speculativeAppendAcceptedTokenCounts); + m.impl("speculative_apply_sparse_count_frequency_penalty", + &tensorrt_llm::torch_ext::speculativeApplySparseCountFrequencyPenalty); + m.impl("speculative_append_sparse_token_counts", + &tensorrt_llm::torch_ext::speculativeAppendSparseTokenCounts); + m.impl("speculative_init_sparse_token_counts", &tensorrt_llm::torch_ext::speculativeInitSparseTokenCounts); +} diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 
a32734bd599..cea5691e222 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -332,6 +332,9 @@ def __init__( self.is_warmup = False self.previous_request_ids = [] + self.previous_device_sampled_request_ids: set[int] = set() + self.debug_spec_device_draft_guard = ( + os.environ.get("TRTLLM_SPEC_DRAFT_GUARD_DEBUG", "0") == "1") self.has_previous_device_draft = False self.previous_accepted_tokens_cuda = torch.empty((self.batch_size, ), dtype=torch.int, @@ -1410,7 +1413,8 @@ def _set_up_spec_metadata( max_num_tokens=self.max_num_tokens, spec_resource_manager=spec_resource_manager, is_draft_model=self.is_draft_model, - max_seq_len=self.max_seq_len) + max_seq_len=self.max_seq_len, + max_num_sequence_slots=self.get_max_num_sequences()) if self.spec_metadata is not None: return self.spec_metadata @@ -1421,7 +1425,8 @@ def _set_up_spec_metadata( max_num_tokens=self.max_num_tokens, spec_resource_manager=spec_resource_manager, is_draft_model=self.is_draft_model, - max_seq_len=self.max_seq_len) + max_seq_len=self.max_seq_len, + max_num_sequence_slots=self.get_max_num_sequences()) return self.spec_metadata def __del__(self) -> None: @@ -2374,13 +2379,27 @@ def _prepare_tp_inputs( extend_dummy_requests = [] generation_requests = [] first_draft_requests = [] + previous_device_sampled_request_ids = self.previous_device_sampled_request_ids # Collect generation request IDs during categorization to avoid # a separate iteration over scheduled_requests.generation_requests later. 
all_gen_request_ids = [] for request in scheduled_requests.generation_requests: all_gen_request_ids.append(request.py_request_id) - if get_draft_token_length( - request) > 0 or next_draft_tokens_device is not None: + has_previous_device_draft = ( + next_draft_tokens_device is not None + and request.py_batch_idx is not None + and request.py_request_id in previous_device_sampled_request_ids) + if (self.debug_spec_device_draft_guard + and next_draft_tokens_device is not None + and request.py_batch_idx is not None + and request.py_request_id + not in previous_device_sampled_request_ids): + logger.info( + "Ignoring stale speculative device draft for request_id=%s " + "prev_seq_slot=%s current_seq_slot=%s", + request.py_request_id, request.py_batch_idx, + request.py_seq_slot) + if get_draft_token_length(request) > 0 or has_previous_device_draft: if request.is_dummy: extend_dummy_requests.append(request) else: @@ -2416,7 +2435,11 @@ def _prepare_tp_inputs( # (1) next_draft_tokens_device is None, which means overlap scheduler is disabled; or # (2) a dummy request; or # (3) the first step in the generation server of disaggregated serving - if next_draft_tokens_device is None or request.is_dummy or request.py_batch_idx is None: + has_previous_device_draft = ( + next_draft_tokens_device is not None + and request.py_batch_idx is not None + and request.py_request_id in previous_device_sampled_request_ids) + if not has_previous_device_draft or request.is_dummy: # get token ids, including input token ids and draft token ids. For these dummy requests, # no need to copy the token ids. 
if not (request.is_attention_dp_dummy @@ -3069,6 +3092,9 @@ def previous_seq_slots_device(): if spec_metadata is not None: total_draft_lens = sum(draft_lens) + spec_sampling_requests = ( + scheduled_requests.context_requests + extend_requests + + first_draft_requests + generation_requests) spec_metadata.draft_tokens = self.draft_tokens_cuda[: total_draft_lens] spec_metadata.request_ids = request_ids @@ -3077,13 +3103,23 @@ def previous_seq_slots_device(): scheduled_requests.generation_requests) spec_metadata.num_tokens = total_num_tokens spec_metadata.seq_lens = sequence_lengths + spec_metadata.sampling_request_ids = [ + int(request.py_request_id) for request in spec_sampling_requests + ] + spec_metadata.sampling_seq_slots = [ + int(request.py_seq_slot) + if request.py_seq_slot is not None else -1 + for request in spec_sampling_requests + ] spec_metadata.num_accepted_draft_tokens = self.num_accepted_draft_tokens_cuda[:len( num_accepted_draft_tokens)] if isinstance(spec_metadata, Eagle3SpecMetadata): spec_metadata.request_accepted_path = request_accepted_path # No-op for non 1-model spec_metadata.populate_sampling_params_for_one_model( - scheduled_requests.all_requests()) + spec_sampling_requests) + spec_metadata.prepare_device_penalty_counts( + int(self.model.config.vocab_size)) spec_metadata.prepare() inputs['spec_metadata'] = spec_metadata @@ -3116,6 +3152,13 @@ def previous_seq_slots_device(): if not self.is_warmup: self.previous_request_ids = all_gen_request_ids + self.previous_device_sampled_request_ids = { + request.py_request_id + for request in ( + scheduled_requests.context_requests_last_chunk + + scheduled_requests.generation_requests) + if not request.is_dummy + } self.has_previous_device_draft = next_draft_tokens_device is not None return inputs, self.gather_ids_cuda[:len( @@ -3871,9 +3914,68 @@ def capture_postprocess_fn(inputs: Dict[str, Any]): self.forward_pass_callable() self._execute_logit_post_processors(scheduled_requests, outputs) + 
self._attach_spec_penalty_outputs(outputs, + inputs.get("spec_metadata")) return outputs + @staticmethod + def _attach_spec_penalty_outputs(outputs: Dict[str, Any], + spec_metadata: Any) -> None: + if not isinstance(outputs, dict) or spec_metadata is None: + return + + sampling_request_ids = getattr(spec_metadata, "sampling_request_ids", + None) + if sampling_request_ids is not None: + outputs["penalty_sampling_request_ids"] = sampling_request_ids + sampling_seq_slots = getattr(spec_metadata, "sampling_seq_slots", None) + if sampling_seq_slots is not None: + outputs["penalty_sampling_seq_slots"] = sampling_seq_slots + + if getattr(spec_metadata, "use_device_penalty_counts", False): + count_seq_slots = getattr(spec_metadata, + "device_penalty_count_seq_slots", None) + if count_seq_slots is not None: + outputs["penalty_count_seq_slots"] = count_seq_slots + count_mode = getattr(spec_metadata, "device_penalty_count_mode", + "") + if (count_mode == "dense" + and getattr(spec_metadata, "device_penalty_token_counts", + None) is not None): + outputs[ + "penalty_token_counts"] = spec_metadata.device_penalty_token_counts + elif (count_mode == "sparse" + and getattr(spec_metadata, "device_penalty_sparse_token_ids", + None) is not None + and getattr(spec_metadata, + "device_penalty_sparse_token_counts", + None) is not None + and getattr(spec_metadata, "device_penalty_sparse_count_lens", + None) is not None): + outputs[ + "penalty_sparse_token_ids"] = spec_metadata.device_penalty_sparse_token_ids + outputs[ + "penalty_sparse_token_counts"] = spec_metadata.device_penalty_sparse_token_counts + outputs[ + "penalty_sparse_count_lens"] = spec_metadata.device_penalty_sparse_count_lens + outputs["penalty_count_vocab_size"] = getattr( + spec_metadata, "device_penalty_count_vocab_size", 0) + + if (getattr(spec_metadata, "use_device_penalty_history", False) + and getattr(spec_metadata, "device_penalty_history_tokens", + None) is not None + and getattr(spec_metadata, 
"device_penalty_history_lens", + None) is not None): + history_seq_slots = getattr(spec_metadata, + "device_penalty_seq_slots", None) + if history_seq_slots is not None: + outputs["penalty_history_seq_slots"] = history_seq_slots + outputs[ + "penalty_history_tokens"] = spec_metadata.device_penalty_history_tokens + outputs[ + "penalty_history_lens"] = spec_metadata.device_penalty_history_lens + def model_forward(self, **kwargs): attrs = get_model_extra_attrs() assert attrs is not None, "Model extra attrs is not set" diff --git a/tensorrt_llm/_torch/speculative/interface.py b/tensorrt_llm/_torch/speculative/interface.py index f4dad9d8f4e..d7094b5c8d2 100644 --- a/tensorrt_llm/_torch/speculative/interface.py +++ b/tensorrt_llm/_torch/speculative/interface.py @@ -1,5 +1,6 @@ import copy import os +from collections import Counter from abc import ABC, abstractmethod from contextlib import contextmanager from dataclasses import dataclass, field @@ -20,6 +21,7 @@ if TYPE_CHECKING: from ..pyexecutor.guided_decoder import CapturableGuidedDecoder + from ..pyexecutor.llm_request import LlmRequest if IS_FLASHINFER_AVAILABLE: import flashinfer @@ -328,6 +330,9 @@ class SpecMetadata: max_draft_len: int # The max number of draft tokens for the static tree and dynamic tree . max_total_draft_tokens: int + # Capacity for persistent sequence-slot indexed state. This can be larger + # than max_num_requests when the executor has multiple sequence slots. + max_num_sequence_slots: Optional[int] = None # The number of gen-phase sequences in the batch. num_generations: int = 0 # Whether CUDA graph is enabled. 
@@ -384,6 +389,73 @@ class SpecMetadata: temperatures: Optional[torch.Tensor] = None top_ks: Optional[torch.Tensor] = None top_ps: Optional[torch.Tensor] = None + recent_penalty_token_ids: Optional[torch.Tensor] = field(default=None, + repr=False) + recent_penalty_values: Optional[torch.Tensor] = field(default=None, + repr=False) + recent_seq_penalty_token_ids: Optional[torch.Tensor] = field(default=None, + repr=False) + recent_seq_penalty_values: Optional[torch.Tensor] = field(default=None, + repr=False) + draft_prefix_penalty_token_ids: Optional[torch.Tensor] = field( + default=None, repr=False) + draft_prefix_penalty_values: Optional[torch.Tensor] = field(default=None, + repr=False) + draft_prefix_penalty_rows: Optional[torch.Tensor] = field(default=None, + repr=False) + device_penalty_history_tokens: Optional[torch.Tensor] = field( + default=None, repr=False) + device_penalty_history_lens: Optional[torch.Tensor] = field(default=None, + repr=False) + device_penalty_row_slots: Optional[torch.Tensor] = field(default=None, + repr=False) + device_penalty_seq_slots: Optional[torch.Tensor] = field(default=None, + repr=False) + device_frequency_penalties: Optional[torch.Tensor] = field(default=None, + repr=False) + device_seq_frequency_penalties: Optional[torch.Tensor] = field( + default=None, repr=False) + device_penalty_history_capacity: int = 0 + use_device_penalty_history: bool = False + device_penalty_token_counts: Optional[torch.Tensor] = field(default=None, + repr=False) + device_penalty_sparse_token_ids: Optional[torch.Tensor] = field( + default=None, repr=False) + device_penalty_sparse_token_counts: Optional[torch.Tensor] = field( + default=None, repr=False) + device_penalty_sparse_count_lens: Optional[torch.Tensor] = field( + default=None, repr=False) + device_penalty_count_row_slots: Optional[torch.Tensor] = field( + default=None, repr=False) + device_penalty_count_seq_slots: Optional[torch.Tensor] = field( + default=None, repr=False) + 
device_count_frequency_penalties: Optional[torch.Tensor] = field( + default=None, repr=False) + device_count_seq_frequency_penalties: Optional[torch.Tensor] = field( + default=None, repr=False) + device_penalty_count_reset_slots: Optional[torch.Tensor] = field( + default=None, repr=False) + device_penalty_count_reset_count: int = 0 + device_penalty_count_prompt_tokens: Optional[torch.Tensor] = field( + default=None, repr=False) + device_penalty_count_prompt_token_counts: Optional[torch.Tensor] = field( + default=None, repr=False) + device_penalty_count_prompt_lens: Optional[torch.Tensor] = field( + default=None, repr=False) + device_penalty_count_prompt_seq_slots: Optional[torch.Tensor] = field( + default=None, repr=False) + device_penalty_count_prompt_count: int = 0 + device_penalty_count_prompt_capacity: int = 0 + device_penalty_sparse_count_capacity: int = 0 + device_penalty_count_vocab_size: int = 0 + device_penalty_count_mode: str = "dense" + use_device_penalty_counts: bool = False + device_penalty_count_slot_request_ids: dict[int, int] = field( + default_factory=dict, repr=False) + cuda_graph_source_metadata: Optional[object] = field(default=None, + repr=False) + sampling_request_ids: Optional[list[int]] = field(default=None, repr=False) + sampling_seq_slots: Optional[list[int]] = field(default=None, repr=False) def __post_init__(self): pass @@ -403,9 +475,30 @@ def create_cuda_graph_metadata(self, max_batch_size: int): cuda_graph_metadata = copy.copy(self) cuda_graph_metadata.is_cuda_graph = True cuda_graph_metadata.max_num_requests = max_batch_size + cuda_graph_metadata.cuda_graph_source_metadata = self + cuda_graph_metadata.device_penalty_count_slot_request_ids = ( + self.device_penalty_count_slot_request_ids) + cuda_graph_metadata._sync_device_penalty_count_state_from_owner() cuda_graph_metadata.__post_init__() return cuda_graph_metadata + def _device_penalty_count_state_owner(self): + return self.cuda_graph_source_metadata or self + + def 
_sync_device_penalty_count_state_from_owner(self) -> None: + owner = self._device_penalty_count_state_owner() + if owner is self: + return + for name in ( + "device_penalty_token_counts", + "device_penalty_sparse_token_ids", + "device_penalty_sparse_token_counts", + "device_penalty_sparse_count_lens", + "device_penalty_sparse_count_capacity", + "device_penalty_count_vocab_size", + ): + setattr(self, name, getattr(owner, name)) + def is_layer_capture(self, layer_id: int): """ Whether the layer should be captured (eg for Eagle3). @@ -421,6 +514,850 @@ def maybe_capture_hidden_states(self, layer_id: int, model. Use this method to record them. By default, does nothing. """ + @staticmethod + def _sampling_config_value(config, name: str, default): + value = getattr(config, name, None) + if value is None: + return default + if isinstance(value, torch.Tensor): + if value.numel() == 0: + return default + return value.flatten()[0].item() + if isinstance(value, (list, tuple)): + if len(value) == 0: + return default + value = value[0] + return default if value is None else value + + @staticmethod + def _effective_prompt_ignore_length(request: "LlmRequest", + prompt_ignore_length: int) -> int: + prompt_len = getattr(request, "py_orig_prompt_len", None) + if prompt_len is None: + prompt_len = getattr(request, "orig_prompt_len", None) + if prompt_len is None: + prompt_len = getattr(request, "py_prompt_len", None) + if prompt_len is None: + prompt_len = getattr(request, "prompt_len", 0) + return min(max(prompt_ignore_length, 0), max(int(prompt_len), 0)) + + @staticmethod + def _prompt_len(request: "LlmRequest") -> int: + for attr in ("py_orig_prompt_len", "orig_prompt_len", "py_prompt_len", + "prompt_len"): + value = getattr(request, attr, None) + if value is not None: + return max(int(value), 0) + return 0 + + def _valid_seq_slot(self, slot: int) -> bool: + return 0 <= slot < self._max_num_sequence_slots() + + def _max_num_sequence_slots(self) -> int: + max_num_sequence_slots = 
self.max_num_sequence_slots + if max_num_sequence_slots is None or max_num_sequence_slots <= 0: + return self.max_num_requests + return max(int(max_num_sequence_slots), self.max_num_requests) + + @staticmethod + def _env_bool(value: Optional[str]) -> Optional[bool]: + if value is None: + return None + normalized = value.strip().lower() + if normalized in ("", "auto"): + return None + if normalized in ("1", "true", "yes", "on"): + return True + if normalized in ("0", "false", "no", "off"): + return False + return None + + @staticmethod + def _is_disagg_generation_role() -> bool: + role = os.environ.get("TRTLLM_DISAGG_ROLE", "").strip().lower() + if role in ("generation", "gen", "decode"): + return True + return os.environ.get("TRTLLM_DISAGG_BENCHMARK_GEN_ONLY") == "1" + + def _force_graph_count_path_enabled(self) -> bool: + override = self._env_bool( + os.environ.get("TRTLLM_SPEC_FORCE_GRAPH_COUNT_PATH")) + if override is not None: + return self.is_cuda_graph and override + return self.is_cuda_graph and self._is_disagg_generation_role() + + def _ensure_recent_penalty_buffers(self, width: int) -> None: + max_rows = (self.max_draft_len + 1) * self.max_num_requests + max_seqs = self.max_num_requests + needs_alloc = ( + self.recent_penalty_token_ids is None + or self.recent_penalty_values is None + or self.recent_seq_penalty_token_ids is None + or self.recent_seq_penalty_values is None + or self.recent_penalty_token_ids.shape != (max_rows, width) + or self.recent_seq_penalty_token_ids.shape != (max_seqs, width)) + if not needs_alloc: + return + + self.recent_penalty_token_ids = torch.zeros((max_rows, width), + dtype=torch.long, + device="cuda") + self.recent_penalty_values = torch.zeros((max_rows, width), + dtype=torch.float32, + device="cuda") + self.recent_seq_penalty_token_ids = torch.zeros((max_seqs, width), + dtype=torch.long, + device="cuda") + self.recent_seq_penalty_values = torch.zeros((max_seqs, width), + dtype=torch.float32, + device="cuda") + + def 
_ensure_draft_prefix_penalty_buffers(self, width: int) -> None: + max_rows = (self.max_draft_len + 1) * self.max_num_requests + needs_alloc = ( + self.draft_prefix_penalty_token_ids is None + or self.draft_prefix_penalty_values is None + or self.draft_prefix_penalty_rows is None + or self.draft_prefix_penalty_token_ids.shape != (max_rows, width)) + if not needs_alloc: + return + + self.draft_prefix_penalty_token_ids = torch.zeros((max_rows, width), + dtype=torch.long, + device="cuda") + self.draft_prefix_penalty_values = torch.zeros((max_rows, width), + dtype=torch.float32, + device="cuda") + self.draft_prefix_penalty_rows = torch.arange(max_rows, + dtype=torch.long, + device="cuda") + + def _ensure_device_penalty_history_buffers(self) -> None: + max_rows = (self.max_draft_len + 1) * self.max_num_requests + slot_capacity = self._max_num_sequence_slots() + capacity = int( + os.environ.get("TRTLLM_SPEC_PENALTY_HISTORY_TOKENS", "16384")) + capacity = max(capacity, 0) + if capacity == 0: + self.use_device_penalty_history = False + return + + needs_alloc = ( + self.device_penalty_history_tokens is None + or self.device_penalty_history_lens is None + or self.device_penalty_row_slots is None + or self.device_penalty_seq_slots is None + or self.device_frequency_penalties is None + or self.device_seq_frequency_penalties is None + or self.device_penalty_history_tokens.shape != + (slot_capacity, capacity) + or self.device_penalty_row_slots.shape != (max_rows, )) + if not needs_alloc: + return + + self.device_penalty_history_capacity = capacity + self.device_penalty_history_tokens = torch.zeros( + (slot_capacity, capacity), dtype=torch.int32, device="cuda") + self.device_penalty_history_lens = torch.zeros( + (slot_capacity, ), dtype=torch.int32, device="cuda") + self.device_penalty_row_slots = torch.zeros((max_rows, ), + dtype=torch.int32, + device="cuda") + self.device_penalty_seq_slots = torch.zeros((self.max_num_requests, ), + dtype=torch.int32, + device="cuda") + 
self.device_frequency_penalties = torch.zeros((max_rows, ), + dtype=torch.float32, + device="cuda") + self.device_seq_frequency_penalties = torch.zeros( + (self.max_num_requests, ), dtype=torch.float32, device="cuda") + + def _ensure_device_penalty_count_metadata_buffers(self) -> None: + max_rows = (self.max_draft_len + 1) * self.max_num_requests + needs_alloc = ( + self.device_penalty_count_row_slots is None + or self.device_penalty_count_seq_slots is None + or self.device_count_frequency_penalties is None + or self.device_count_seq_frequency_penalties is None + or self.device_penalty_count_reset_slots is None + or self.device_penalty_count_row_slots.shape != (max_rows, )) + if not needs_alloc: + return + + self.device_penalty_count_row_slots = torch.zeros((max_rows, ), + dtype=torch.int32, + device="cuda") + self.device_penalty_count_seq_slots = torch.zeros( + (self.max_num_requests, ), dtype=torch.int32, device="cuda") + self.device_count_frequency_penalties = torch.zeros((max_rows, ), + dtype=torch.float32, + device="cuda") + self.device_count_seq_frequency_penalties = torch.zeros( + (self.max_num_requests, ), dtype=torch.float32, device="cuda") + self.device_penalty_count_reset_slots = torch.zeros( + (self.max_num_requests, ), dtype=torch.int64, device="cuda") + + def ensure_device_penalty_count_buffers(self, vocab_size: int) -> None: + if vocab_size <= 0: + self.use_device_penalty_counts = False + return + owner = self._device_penalty_count_state_owner() + if owner is not self: + owner.device_penalty_count_mode = self.device_penalty_count_mode + owner.use_device_penalty_counts = self.use_device_penalty_counts + owner.ensure_device_penalty_count_buffers(vocab_size) + self._sync_device_penalty_count_state_from_owner() + return + + slot_capacity = self._max_num_sequence_slots() + if self.device_penalty_count_mode == "dense": + if (self.device_penalty_token_counts is not None + and self.device_penalty_count_vocab_size == vocab_size + and 
self.device_penalty_token_counts.shape == + (slot_capacity, vocab_size)): + return + + self.device_penalty_count_vocab_size = vocab_size + self.device_penalty_token_counts = torch.zeros( + (slot_capacity, vocab_size), + dtype=torch.int32, + device="cuda") + return + + capacity_env = os.environ.get("TRTLLM_SPEC_SPARSE_COUNT_CAPACITY", + "").strip() + capacity = int(capacity_env) if capacity_env else 0 + if capacity <= 0: + capacity = vocab_size + else: + capacity = min(capacity, vocab_size) + if (self.device_penalty_sparse_token_ids is not None + and self.device_penalty_sparse_token_counts is not None + and self.device_penalty_sparse_count_lens is not None + and self.device_penalty_count_vocab_size == vocab_size + and self.device_penalty_sparse_count_capacity == capacity + and self.device_penalty_sparse_token_ids.shape == + (slot_capacity, capacity)): + return + + self.device_penalty_count_vocab_size = vocab_size + self.device_penalty_sparse_count_capacity = capacity + self.device_penalty_sparse_token_ids = torch.zeros( + (slot_capacity, capacity), dtype=torch.int32, device="cuda") + self.device_penalty_sparse_token_counts = torch.zeros( + (slot_capacity, capacity), dtype=torch.int32, device="cuda") + self.device_penalty_sparse_count_lens = torch.zeros( + (slot_capacity, ), dtype=torch.int32, device="cuda") + + def reset_device_penalty_count_slots(self) -> None: + if (not self.use_device_penalty_counts + or self.device_penalty_count_reset_slots is None + or self.device_penalty_count_reset_count == 0): + return + reset_slots = self.device_penalty_count_reset_slots[: + self.device_penalty_count_reset_count] + reset_slots = reset_slots[(reset_slots >= 0) + & + (reset_slots < self._max_num_sequence_slots())] + if reset_slots.numel() == 0: + self.device_penalty_count_reset_count = 0 + return + if self.device_penalty_count_mode == "dense": + if self.device_penalty_token_counts is None: + return + self.device_penalty_token_counts.index_fill_(0, reset_slots, 0) + else: + if 
self.device_penalty_sparse_count_lens is None: + return + self.device_penalty_sparse_count_lens.index_fill_(0, reset_slots, 0) + self.device_penalty_count_reset_count = 0 + + def init_device_penalty_count_prompt_tokens(self) -> None: + if (not self.use_device_penalty_counts + or self.device_penalty_count_prompt_tokens is None + or self.device_penalty_count_prompt_lens is None + or self.device_penalty_count_prompt_seq_slots is None + or self.device_penalty_count_prompt_count == 0): + return + + count = self.device_penalty_count_prompt_count + if self.device_penalty_count_mode == "dense": + if self.device_penalty_token_counts is None: + return + from .one_model_sampler import append_accepted_tokens_to_counts + append_accepted_tokens_to_counts( + self.device_penalty_token_counts, + self.device_penalty_count_prompt_seq_slots[:count], + self.device_penalty_count_prompt_tokens[:count].contiguous(), + self.device_penalty_count_prompt_lens[:count].contiguous()) + else: + if (self.device_penalty_sparse_token_ids is None + or self.device_penalty_sparse_token_counts is None + or self.device_penalty_sparse_count_lens is None + or self.device_penalty_count_prompt_token_counts is None): + return + width = self.device_penalty_count_prompt_capacity + from .one_model_sampler import init_sparse_token_counts + init_sparse_token_counts( + self.device_penalty_sparse_token_ids, + self.device_penalty_sparse_token_counts, + self.device_penalty_sparse_count_lens, + self.device_penalty_count_prompt_tokens[:count, : + width].contiguous(), + self.device_penalty_count_prompt_token_counts[:count, : + width].contiguous(), + self.device_penalty_count_prompt_lens[:count].contiguous(), + self.device_penalty_count_prompt_seq_slots[:count].contiguous(), + self.device_penalty_count_vocab_size) + self.device_penalty_count_prompt_count = 0 + + def prepare_device_penalty_counts(self, vocab_size: int) -> None: + if not self.use_device_penalty_counts: + return + 
self.ensure_device_penalty_count_buffers(vocab_size) + self.reset_device_penalty_count_slots() + self.init_device_penalty_count_prompt_tokens() + + def _ensure_device_penalty_count_prompt_buffers( + self, max_prompt_tokens: int) -> None: + if max_prompt_tokens <= 0: + return + if (self.device_penalty_count_prompt_tokens is not None + and self.device_penalty_count_prompt_token_counts is not None + and self.device_penalty_count_prompt_lens is not None + and self.device_penalty_count_prompt_seq_slots is not None + and self.device_penalty_count_prompt_capacity >= max_prompt_tokens): + return + + self.device_penalty_count_prompt_capacity = max_prompt_tokens + self.device_penalty_count_prompt_tokens = torch.zeros( + (self.max_num_requests, max_prompt_tokens), + dtype=torch.int32, + device="cuda") + self.device_penalty_count_prompt_token_counts = torch.zeros( + (self.max_num_requests, max_prompt_tokens), + dtype=torch.int32, + device="cuda") + self.device_penalty_count_prompt_lens = torch.zeros( + (self.max_num_requests, ), dtype=torch.int32, device="cuda") + self.device_penalty_count_prompt_seq_slots = torch.zeros( + (self.max_num_requests, ), dtype=torch.int32, device="cuda") + + def _populate_device_count_frequency_penalties( + self, requests: list["LlmRequest"]) -> bool: + debug_rows: list[dict[str, object]] = [] + + def debug_return(reason: str, **extra: object) -> None: + if int(os.environ.get("TRTLLM_SPEC_COUNT_DEBUG_TOKENS", "0")) <= 0: + return + if os.environ.get("TRTLLM_SPEC_COUNT_DEBUG_SKIP_ZERO", "1") == "1": + has_penalty_row = any( + float(row.get("frequency_penalty", 0.0)) != 0.0 + or float(row.get("presence_penalty", 0.0)) != 0.0 + for row in debug_rows) + if not has_penalty_row and reason != "enabled": + return + max_logs = max( + int(os.environ.get("TRTLLM_SPEC_COUNT_DEBUG_POPULATE_LOGS", + "8")), 0) + logs = int(getattr(self, "_device_count_populate_debug_logs", 0)) + if logs >= max_logs: + return + setattr(self, "_device_count_populate_debug_logs", 
logs + 1) + extra.setdefault("max_num_requests", self.max_num_requests) + extra.setdefault("slot_capacity", self._max_num_sequence_slots()) + logger.info( + "Spec count populate debug reason=%s env_use_counts=%s " + "env_mode=%s allow_advanced=%s spec_mode=%s use_one_engine=%s " + "request_count=%s rows=%s extra=%s", + reason, + os.environ.get("TRTLLM_SPEC_USE_DEVICE_COUNTS", "0"), + os.environ.get("TRTLLM_SPEC_COUNT_MODE", ""), + self.allow_advanced_sampling, + self.spec_dec_mode, + self.spec_dec_mode.use_one_engine(), + len(requests), + debug_rows, + extra) + + if os.environ.get("TRTLLM_SPEC_USE_DEVICE_COUNTS", "0") != "1": + self.use_device_penalty_counts = False + debug_return("env_disabled") + return False + if not self.allow_advanced_sampling or not self.spec_dec_mode.use_one_engine( + ): + self.use_device_penalty_counts = False + debug_return("not_advanced_or_not_one_engine") + return False + + row_slots: list[int] = [] + seq_slots: list[int] = [] + frequency_penalties: list[float] = [] + seq_frequency_penalties: list[float] = [] + reset_slots: list[int] = [] + prompt_init_slots: list[int] = [] + prompt_init_tokens: list[list[int]] = [] + prompt_init_token_counts: list[list[int]] = [] + any_penalty = False + can_use = True + count_mode = os.environ.get("TRTLLM_SPEC_COUNT_MODE", + "sparse").strip().lower() + if count_mode not in ("dense", "sparse"): + count_mode = "sparse" + force_graph_count_path = self._force_graph_count_path_enabled() + sparse_capacity_limit = int( + os.environ.get("TRTLLM_SPEC_SPARSE_COUNT_CAPACITY", "") or "0") + + next_slot_request_ids: dict[int, int] = {} + + for request in requests: + raw_slot = getattr(request, "py_seq_slot", None) + slot = int(raw_slot) if raw_slot is not None else -1 + valid_slot = (self._valid_seq_slot(slot) + and not getattr(request, "is_dummy", False)) + effective_slot = slot if valid_slot else -1 + seq_slots.append(effective_slot) + request_id = int(getattr(request, "py_request_id", + getattr(request, 
"request_id", -1))) + if valid_slot: + next_slot_request_ids[slot] = request_id + + sampling_config = request.sampling_config + frequency_penalty = float( + self._sampling_config_value(sampling_config, + "frequency_penalty", 0.0)) + presence_penalty = float( + self._sampling_config_value(sampling_config, + "presence_penalty", 0.0)) + prompt_ignore_length = int( + self._sampling_config_value(sampling_config, + "prompt_ignore_length", 0)) + raw_prompt_len = self._prompt_len(request) + if len(debug_rows) < 8: + state = getattr(request, "state", None) + debug_rows.append({ + "request_id": request_id, + "slot": slot, + "valid_slot": valid_slot, + "state": getattr(state, "name", str(state)), + "frequency_penalty": frequency_penalty, + "presence_penalty": presence_penalty, + "prompt_ignore_length": prompt_ignore_length, + "raw_prompt_len": raw_prompt_len, + }) + + if presence_penalty != 0.0: + can_use = False + break + + if not valid_slot: + frequency_penalty = 0.0 + + any_penalty = any_penalty or frequency_penalty != 0.0 + is_new_slot_request = valid_slot and ( + self.device_penalty_count_slot_request_ids.get(slot) != request_id) + if is_new_slot_request: + reset_slots.append(slot) + if frequency_penalty != 0.0: + ignore_length = self._effective_prompt_ignore_length( + request, prompt_ignore_length) + tokens = request.get_tokens(0) + count_history = [ + int(token) for token in tokens[ignore_length:] + if int(token) >= 0 + ] + if count_history: + if count_mode == "sparse": + counts = Counter(count_history) + unique_tokens = list(counts.keys()) + if (sparse_capacity_limit > 0 + and len(unique_tokens) > + sparse_capacity_limit): + # Do not disable device-side generated-token + # counts for the whole batch just because one + # request history cannot fit in the sparse table. + # The request still starts with an empty count + # table, and accepted generated tokens are + # appended below by the sampler. 
+ continue + prompt_init_slots.append(slot) + prompt_init_tokens.append(unique_tokens) + prompt_init_token_counts.append([ + int(counts[token]) for token in unique_tokens + ]) + else: + prompt_init_slots.append(slot) + prompt_init_tokens.append(count_history) + prompt_init_token_counts.append([]) + + from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequestState + num_rows = 1 + self.runtime_draft_len if request.state == LlmRequestState.GENERATION_IN_PROGRESS else 1 + row_slots.extend(effective_slot for _ in range(num_rows)) + frequency_penalties.extend(frequency_penalty for _ in range(num_rows)) + seq_frequency_penalties.append(frequency_penalty) + + if not can_use or not row_slots or (not any_penalty + and not force_graph_count_path): + self.use_device_penalty_counts = False + debug_return("not_can_use_or_no_rows_or_no_penalty", + can_use=can_use, + row_count=len(row_slots), + any_penalty=any_penalty, + force_graph_count_path=force_graph_count_path) + return False + + self.device_penalty_count_slot_request_ids.update( + next_slot_request_ids) + + self.device_penalty_count_mode = count_mode + self._ensure_device_penalty_count_metadata_buffers() + assert self.device_penalty_count_row_slots is not None + assert self.device_penalty_count_seq_slots is not None + assert self.device_count_frequency_penalties is not None + assert self.device_count_seq_frequency_penalties is not None + assert self.device_penalty_count_reset_slots is not None + max_prompt_tokens = max((len(tokens) for tokens in prompt_init_tokens), + default=0) + self._ensure_device_penalty_count_prompt_buffers(max_prompt_tokens) + + self.device_penalty_count_seq_slots[:len(seq_slots)].copy_( + torch.tensor(seq_slots, dtype=torch.int32, + pin_memory=prefer_pinned()), + non_blocking=True) + self.device_penalty_count_row_slots[:len(row_slots)].copy_( + torch.tensor(row_slots, dtype=torch.int32, + pin_memory=prefer_pinned()), + non_blocking=True) + 
self.device_count_frequency_penalties[:len(frequency_penalties)].copy_( + torch.tensor(frequency_penalties, + dtype=torch.float32, + pin_memory=prefer_pinned()), + non_blocking=True) + self.device_count_seq_frequency_penalties[:len(seq_frequency_penalties + )].copy_( + torch.tensor( + seq_frequency_penalties, + dtype=torch.float32, + pin_memory=prefer_pinned( + )), + non_blocking=True) + self.device_penalty_count_reset_count = len(reset_slots) + if reset_slots: + self.device_penalty_count_reset_slots[:len(reset_slots)].copy_( + torch.tensor(reset_slots, + dtype=torch.int64, + pin_memory=prefer_pinned()), + non_blocking=True) + self.device_penalty_count_prompt_count = len(prompt_init_tokens) + if prompt_init_tokens: + assert self.device_penalty_count_prompt_tokens is not None + assert self.device_penalty_count_prompt_token_counts is not None + assert self.device_penalty_count_prompt_lens is not None + assert self.device_penalty_count_prompt_seq_slots is not None + prompt_tensor = torch.zeros( + (len(prompt_init_tokens), max_prompt_tokens), + dtype=torch.int32, + pin_memory=prefer_pinned()) + prompt_counts_tensor = torch.zeros( + (len(prompt_init_tokens), max_prompt_tokens), + dtype=torch.int32, + pin_memory=prefer_pinned()) + prompt_lens = [] + for row, tokens in enumerate(prompt_init_tokens): + prompt_lens.append(len(tokens)) + prompt_tensor[row, :len(tokens)] = torch.tensor( + tokens, dtype=torch.int32) + if count_mode == "sparse": + prompt_counts_tensor[row, :len(tokens)] = torch.tensor( + prompt_init_token_counts[row], dtype=torch.int32) + self.device_penalty_count_prompt_tokens[:len(prompt_init_tokens), + :max_prompt_tokens].copy_( + prompt_tensor, + non_blocking=True) + if count_mode == "sparse": + self.device_penalty_count_prompt_token_counts[:len( + prompt_init_tokens), :max_prompt_tokens].copy_( + prompt_counts_tensor, non_blocking=True) + self.device_penalty_count_prompt_lens[:len(prompt_lens)].copy_( + torch.tensor(prompt_lens, + dtype=torch.int32, + 
pin_memory=prefer_pinned()), + non_blocking=True) + self.device_penalty_count_prompt_seq_slots[:len( + prompt_init_slots)].copy_(torch.tensor( + prompt_init_slots, + dtype=torch.int32, + pin_memory=prefer_pinned()), + non_blocking=True) + self.use_device_penalty_counts = True + debug_return("enabled", + row_count=len(row_slots), + any_penalty=any_penalty, + force_graph_count_path=force_graph_count_path, + count_mode=count_mode, + seq_slots_head=seq_slots[:32], + row_slots_head=row_slots[:64], + reset_slots=reset_slots[:32], + prompt_init_slots=prompt_init_slots[:32], + prompt_init_count=len(prompt_init_tokens)) + return True + + def _populate_device_history_frequency_penalties( + self, requests: list["LlmRequest"]) -> bool: + if os.environ.get("TRTLLM_SPEC_USE_DEVICE_HISTORY", "0") != "1": + self.use_device_penalty_history = False + return False + if not self.allow_advanced_sampling or not self.spec_dec_mode.use_one_engine( + ): + self.use_device_penalty_history = False + return False + + row_slots: list[int] = [] + seq_slots: list[int] = [] + frequency_penalties: list[float] = [] + seq_frequency_penalties: list[float] = [] + reset_slots: list[int] = [] + can_use = True + row_mode = os.environ.get("TRTLLM_SPEC_PENALTY_ROW_MODE", + "all").strip().lower() + if row_mode not in ("all", "root"): + row_mode = "all" + + for request in requests: + raw_slot = getattr(request, "py_seq_slot", None) + slot = int(raw_slot) if raw_slot is not None else -1 + valid_slot = (self._valid_seq_slot(slot) + and not getattr(request, "is_dummy", False)) + effective_slot = slot if valid_slot else -1 + seq_slots.append(effective_slot) + + sampling_config = request.sampling_config + frequency_penalty = float( + self._sampling_config_value(sampling_config, + "frequency_penalty", 0.0)) + presence_penalty = float( + self._sampling_config_value(sampling_config, + "presence_penalty", 0.0)) + prompt_ignore_length = int( + self._sampling_config_value(sampling_config, + "prompt_ignore_length", 0)) + 
raw_prompt_len = self._prompt_len(request) + + # The device-history fast path intentionally covers the current + # NVBug workload: frequency penalty over generated tokens only. + # Other token-history semantics fall back to the slower probe path. + if presence_penalty != 0.0 or prompt_ignore_length < raw_prompt_len: + can_use = False + break + + if not valid_slot: + frequency_penalty = 0.0 + seq_frequency_penalties.append(frequency_penalty) + + generated_len = max(request.get_num_tokens(0) - raw_prompt_len, 0) + if valid_slot and generated_len == 0: + reset_slots.append(slot) + + from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequestState + num_rows = 1 + self.runtime_draft_len if request.state == LlmRequestState.GENERATION_IN_PROGRESS else 1 + row_slots.append(effective_slot) + frequency_penalties.append(frequency_penalty) + if num_rows > 1: + if row_mode == "all": + row_slots.extend(effective_slot for _ in range(num_rows - 1)) + frequency_penalties.extend(frequency_penalty + for _ in range(num_rows - 1)) + else: + row_slots.extend(-1 for _ in range(num_rows - 1)) + frequency_penalties.extend(0.0 for _ in range(num_rows - 1)) + + if not can_use or not row_slots: + self.use_device_penalty_history = False + return False + + self._ensure_device_penalty_history_buffers() + if not self.use_device_penalty_history and self.device_penalty_history_tokens is None: + return False + assert self.device_penalty_history_tokens is not None + assert self.device_penalty_history_lens is not None + assert self.device_penalty_row_slots is not None + assert self.device_penalty_seq_slots is not None + assert self.device_frequency_penalties is not None + assert self.device_seq_frequency_penalties is not None + + if reset_slots: + reset_slots_cuda = torch.tensor(reset_slots, + dtype=torch.int64, + pin_memory=prefer_pinned()).to( + "cuda", non_blocking=True) + self.device_penalty_history_lens.index_fill_(0, reset_slots_cuda, 0) + + 
self.device_penalty_seq_slots[:len(seq_slots)].copy_( + torch.tensor(seq_slots, dtype=torch.int32, + pin_memory=prefer_pinned()), + non_blocking=True) + self.device_penalty_row_slots[:len(row_slots)].copy_( + torch.tensor(row_slots, dtype=torch.int32, + pin_memory=prefer_pinned()), + non_blocking=True) + self.device_frequency_penalties[:len(frequency_penalties)].copy_( + torch.tensor(frequency_penalties, + dtype=torch.float32, + pin_memory=prefer_pinned()), + non_blocking=True) + self.device_seq_frequency_penalties[:len(seq_frequency_penalties)].copy_( + torch.tensor(seq_frequency_penalties, + dtype=torch.float32, + pin_memory=prefer_pinned()), + non_blocking=True) + self.use_device_penalty_history = True + return True + + def append_accepted_tokens_to_penalty_history( + self, accepted_tokens: torch.Tensor, + num_accepted_tokens: torch.Tensor, batch_size: int) -> None: + if (self.use_device_penalty_counts + and self.device_penalty_count_seq_slots is not None): + if (self.device_penalty_count_mode == "dense" + and self.device_penalty_token_counts is not None): + from .one_model_sampler import append_accepted_tokens_to_counts + append_accepted_tokens_to_counts( + self.device_penalty_token_counts, + self.device_penalty_count_seq_slots[:batch_size], + accepted_tokens[:batch_size].contiguous(), + num_accepted_tokens[:batch_size].contiguous()) + return + if (self.device_penalty_count_mode == "sparse" + and self.device_penalty_sparse_token_ids is not None + and self.device_penalty_sparse_token_counts is not None + and self.device_penalty_sparse_count_lens is not None): + from .one_model_sampler import append_accepted_tokens_to_sparse_counts + append_accepted_tokens_to_sparse_counts( + self.device_penalty_sparse_token_ids, + self.device_penalty_sparse_token_counts, + self.device_penalty_sparse_count_lens, + self.device_penalty_count_seq_slots[:batch_size], + accepted_tokens[:batch_size].contiguous(), + num_accepted_tokens[:batch_size].contiguous(), + 
self.device_penalty_count_vocab_size) + return + + if not self.use_device_penalty_history: + return + if self.device_penalty_history_tokens is None: + return + assert self.device_penalty_history_lens is not None + assert self.device_penalty_seq_slots is not None + + from .one_model_sampler import append_accepted_tokens_to_history + append_accepted_tokens_to_history( + self.device_penalty_history_tokens, + self.device_penalty_history_lens, + self.device_penalty_seq_slots[:batch_size], + accepted_tokens[:batch_size].contiguous(), + num_accepted_tokens[:batch_size].contiguous()) + + def _populate_recent_token_penalties_for_one_model( + self, requests: list["LlmRequest"]) -> None: + if not self.allow_advanced_sampling or not self.spec_dec_mode.use_one_engine( + ): + return + + width = int(os.environ.get("TRTLLM_SPEC_RECENT_PENALTY_TOKENS", "0")) + width = max(width, 0) + if width == 0: + return + + row_token_ids: list[list[int]] = [] + row_penalty_values: list[list[float]] = [] + seq_token_ids: list[list[int]] = [] + seq_penalty_values: list[list[float]] = [] + any_penalty = False + + for request in requests: + sampling_config = request.sampling_config + frequency_penalty = float( + self._sampling_config_value(sampling_config, + "frequency_penalty", 0.0)) + presence_penalty = float( + self._sampling_config_value(sampling_config, + "presence_penalty", 0.0)) + prompt_ignore_length = int( + self._sampling_config_value(sampling_config, + "prompt_ignore_length", 0)) + + from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequestState + num_rows = 1 + self.runtime_draft_len if request.state == LlmRequestState.GENERATION_IN_PROGRESS else 1 + + if frequency_penalty == 0.0 and presence_penalty == 0.0: + ids = [0] * width + penalties = [0.0] * width + else: + prompt_ignore_length = self._effective_prompt_ignore_length( + request, prompt_ignore_length) + tokens = request.get_tokens(0) + recent_start = max(prompt_ignore_length, len(tokens) - width) + counts: dict[int, int] = 
{} + for token in tokens[recent_start:]: + if token < 0: + continue + counts[token] = counts.get(token, 0) + 1 + items = list(counts.items())[:width] + ids = [token for token, _ in items] + penalties = [ + presence_penalty + frequency_penalty * count + for _, count in items + ] + if penalties: + any_penalty = True + pad = width - len(ids) + if pad > 0: + ids.extend([0] * pad) + penalties.extend([0.0] * pad) + + for _ in range(num_rows): + row_token_ids.append(ids) + row_penalty_values.append(penalties) + seq_token_ids.append(ids) + seq_penalty_values.append(penalties) + + if not row_token_ids: + return + + self._ensure_recent_penalty_buffers(width) + assert self.recent_penalty_token_ids is not None + assert self.recent_penalty_values is not None + assert self.recent_seq_penalty_token_ids is not None + assert self.recent_seq_penalty_values is not None + + num_rows = len(row_token_ids) + num_seqs = len(seq_token_ids) + if not any_penalty: + self.recent_penalty_values[:num_rows].zero_() + self.recent_seq_penalty_values[:num_seqs].zero_() + return + + self.recent_penalty_token_ids[:num_rows].copy_( + torch.tensor(row_token_ids, + dtype=torch.long, + pin_memory=prefer_pinned()), + non_blocking=True) + self.recent_penalty_values[:num_rows].copy_( + torch.tensor(row_penalty_values, + dtype=torch.float32, + pin_memory=prefer_pinned()), + non_blocking=True) + self.recent_seq_penalty_token_ids[:num_seqs].copy_( + torch.tensor(seq_token_ids, + dtype=torch.long, + pin_memory=prefer_pinned()), + non_blocking=True) + self.recent_seq_penalty_values[:num_seqs].copy_( + torch.tensor(seq_penalty_values, + dtype=torch.float32, + pin_memory=prefer_pinned()), + non_blocking=True) + def populate_sampling_params_for_one_model( self, requests: list["LlmRequest"]) -> None: """ @@ -498,6 +1435,10 @@ def populate_sampling_params_for_one_model( self.top_ps[:len(top_ps)].copy_(torch.tensor( top_ps, dtype=torch.float32, pin_memory=prefer_pinned()), non_blocking=True) + if 
self._populate_device_count_frequency_penalties(requests): + return + if not self._populate_device_history_frequency_penalties(requests): + self._populate_recent_token_penalties_for_one_model(requests) class SpecWorkerBase(nn.Module, ABC): @@ -547,13 +1488,14 @@ def skip_forward( next_new_tokens = torch.empty((batch_size, (self.max_draft_len + 1)), dtype=torch.int, device=logits.device) - return { + outputs = { 'logits': logits, 'new_tokens': accepted_tokens, 'new_tokens_lens': num_accepted_tokens, 'next_draft_tokens': next_draft_tokens, 'next_new_tokens': next_new_tokens } + return self._add_penalty_history_outputs(outputs, spec_metadata) def skip_drafting( self, @@ -585,7 +1527,6 @@ def skip_drafting( num_accepted_tokens = torch.ones(batch_size, dtype=torch.int, device=logits.device) - next_draft_tokens = torch.zeros((batch_size, 0), dtype=torch.int, device=logits.device) @@ -595,13 +1536,14 @@ def skip_drafting( device=logits.device) next_new_tokens[:, 0] = target_tokens - return { + outputs = { 'logits': logits, 'new_tokens': accepted_tokens, 'new_tokens_lens': num_accepted_tokens, 'next_draft_tokens': next_draft_tokens, 'next_new_tokens': next_new_tokens } + return self._add_penalty_history_outputs(outputs, spec_metadata) def set_guided_decoder(self, guided_decoder: "CapturableGuidedDecoder") -> bool: @@ -688,8 +1630,12 @@ def _sample_and_accept_draft_tokens_base( device=logits.device) # Sample tokens using per-request sampling parameters - target_tokens = self._sample_tokens_for_batch(logits, spec_metadata, - num_contexts, batch_size) + target_tokens = self._sample_tokens_for_batch( + logits, + spec_metadata, + num_contexts, + batch_size, + draft_tokens=draft_tokens) # Context requests: only accept the sampled token (no draft tokens yet) accepted_tokens[:num_contexts, 0] = target_tokens[:num_contexts] @@ -712,6 +1658,38 @@ def _sample_and_accept_draft_tokens_base( return accepted_tokens, num_accepted_tokens + @staticmethod + def 
_add_penalty_history_outputs(outputs: dict[str, torch.Tensor], + spec_metadata: SpecMetadata): + if spec_metadata.sampling_request_ids is not None: + outputs["penalty_sampling_request_ids"] = spec_metadata.sampling_request_ids + if spec_metadata.sampling_seq_slots is not None: + outputs["penalty_sampling_seq_slots"] = spec_metadata.sampling_seq_slots + if spec_metadata.use_device_penalty_counts: + if spec_metadata.device_penalty_count_seq_slots is not None: + outputs[ + "penalty_count_seq_slots"] = spec_metadata.device_penalty_count_seq_slots + if (spec_metadata.device_penalty_count_mode == "dense" + and spec_metadata.device_penalty_token_counts is not None): + outputs["penalty_token_counts"] = spec_metadata.device_penalty_token_counts + elif (spec_metadata.device_penalty_count_mode == "sparse" + and spec_metadata.device_penalty_sparse_token_ids is not None + and spec_metadata.device_penalty_sparse_token_counts is not None + and spec_metadata.device_penalty_sparse_count_lens is not None): + outputs["penalty_sparse_token_ids"] = spec_metadata.device_penalty_sparse_token_ids + outputs["penalty_sparse_token_counts"] = spec_metadata.device_penalty_sparse_token_counts + outputs["penalty_sparse_count_lens"] = spec_metadata.device_penalty_sparse_count_lens + outputs["penalty_count_vocab_size"] = spec_metadata.device_penalty_count_vocab_size + if (spec_metadata.use_device_penalty_history + and spec_metadata.device_penalty_history_tokens is not None + and spec_metadata.device_penalty_history_lens is not None): + if spec_metadata.device_penalty_seq_slots is not None: + outputs[ + "penalty_history_seq_slots"] = spec_metadata.device_penalty_seq_slots + outputs["penalty_history_tokens"] = spec_metadata.device_penalty_history_tokens + outputs["penalty_history_lens"] = spec_metadata.device_penalty_history_lens + return outputs + def _draft_sampler_greedy(self, logits: torch.Tensor, d2t=None): """ Simple greedy draft token sampling using argmax. 
@@ -731,11 +1709,261 @@ def _draft_sampler_greedy(self, logits: torch.Tensor, d2t=None): return draft_tokens.type(torch.int32) + @staticmethod + def _draft_prefix_frequency_penalties(spec_metadata: SpecMetadata): + if spec_metadata.device_count_seq_frequency_penalties is not None: + return spec_metadata.device_count_seq_frequency_penalties + if spec_metadata.device_seq_frequency_penalties is not None: + return spec_metadata.device_seq_frequency_penalties + return None + + def _apply_draft_prefix_penalty_values( + self, logits: torch.Tensor, spec_metadata: SpecMetadata, + row_token_ids: torch.Tensor, row_penalty_values: torch.Tensor, + num_rows: int, width: int) -> torch.Tensor: + if num_rows <= 0 or width <= 0: + return logits + if row_token_ids is None or row_penalty_values is None: + return logits + if os.environ.get("TRTLLM_SPEC_USE_PENALTY_OP", "0") == "1": + from .one_model_sampler import apply_recent_token_penalties + return apply_recent_token_penalties( + logits, row_token_ids[:num_rows, :width], + row_penalty_values[:num_rows, :width]) + # scatter_add handles duplicate prefix tokens in the same row correctly. 
+ logits.scatter_add_(1, row_token_ids[:num_rows, :width].long(), + -row_penalty_values[:num_rows, :width].to( + logits.dtype)) + return logits + + def _apply_target_draft_prefix_frequency_penalty( + self, logits: torch.Tensor, spec_metadata: SpecMetadata, + num_contexts: int, batch_size: int, + draft_tokens: Optional[torch.Tensor]) -> torch.Tensor: + if os.environ.get("TRTLLM_SPEC_APPLY_DRAFT_PREFIX_PENALTY", + "0") != "1": + return logits + if draft_tokens is None or draft_tokens.numel() == 0: + return logits + runtime_draft_len = int(draft_tokens.shape[-1]) + if runtime_draft_len <= 0: + return logits + frequency_penalties = self._draft_prefix_frequency_penalties( + spec_metadata) + if frequency_penalties is None: + return logits + num_gens = batch_size - num_contexts + if num_gens <= 0: + return logits + + num_tokens = num_contexts + num_gens * (runtime_draft_len + 1) + spec_metadata._ensure_draft_prefix_penalty_buffers(runtime_draft_len) + token_ids = spec_metadata.draft_prefix_penalty_token_ids + penalty_values = spec_metadata.draft_prefix_penalty_values + rows = spec_metadata.draft_prefix_penalty_rows + assert token_ids is not None + assert penalty_values is not None + assert rows is not None + + token_ids[:num_tokens, :runtime_draft_len].zero_() + penalty_values[:num_tokens, :runtime_draft_len].zero_() + gen_frequency_penalties = frequency_penalties[ + num_contexts:batch_size].to(torch.float32) + gen_rows = rows[:num_gens] + row_stride = runtime_draft_len + 1 + for pos in range(1, runtime_draft_len + 1): + target_rows = num_contexts + gen_rows * row_stride + pos + token_ids[target_rows, :pos].copy_(draft_tokens[:, :pos]) + penalty_values[target_rows, :pos].copy_( + gen_frequency_penalties.unsqueeze(1).expand(-1, pos)) + return self._apply_draft_prefix_penalty_values( + logits, spec_metadata, token_ids, penalty_values, num_tokens, + runtime_draft_len) + + def _apply_draft_step_prefix_frequency_penalty( + self, logits: torch.Tensor, spec_metadata: 
SpecMetadata, + batch_size: int, + draft_prefix_tokens: Optional[torch.Tensor]) -> torch.Tensor: + if os.environ.get("TRTLLM_SPEC_APPLY_DRAFT_PREFIX_PENALTY", + "0") != "1": + return logits + if draft_prefix_tokens is None or draft_prefix_tokens.numel() == 0: + return logits + prefix_len = int(draft_prefix_tokens.shape[-1]) + if prefix_len <= 0: + return logits + frequency_penalties = self._draft_prefix_frequency_penalties( + spec_metadata) + if frequency_penalties is None: + return logits + + spec_metadata._ensure_draft_prefix_penalty_buffers(prefix_len) + token_ids = spec_metadata.draft_prefix_penalty_token_ids + penalty_values = spec_metadata.draft_prefix_penalty_values + assert token_ids is not None + assert penalty_values is not None + + token_ids[:batch_size, :prefix_len].copy_( + draft_prefix_tokens[:batch_size, :prefix_len]) + penalty_values[:batch_size, :prefix_len].copy_( + frequency_penalties[:batch_size].to(torch.float32).unsqueeze(1). + expand(-1, prefix_len)) + return self._apply_draft_prefix_penalty_values( + logits, spec_metadata, token_ids, penalty_values, batch_size, + prefix_len) + + def _maybe_apply_history_penalty_to_draft_logits( + self, logits: torch.Tensor, spec_metadata: SpecMetadata, + batch_size: int, d2t=None, + draft_prefix_tokens: Optional[torch.Tensor] = None): + if os.environ.get("TRTLLM_SPEC_APPLY_HISTORY_TO_DRAFT", "0") != "1": + return self._apply_draft_step_prefix_frequency_penalty( + logits, spec_metadata, batch_size, draft_prefix_tokens) + if d2t is not None: + return logits + if (spec_metadata.use_device_penalty_counts + and spec_metadata.device_penalty_count_seq_slots is not None + and spec_metadata.device_count_seq_frequency_penalties is not None): + from .one_model_sampler import (apply_count_frequency_penalty, + apply_sparse_count_frequency_penalty) + spec_metadata.ensure_device_penalty_count_buffers( + int(logits.shape[-1])) + if spec_metadata.device_penalty_count_mode == "dense": + if 
spec_metadata.device_penalty_token_counts is None: + return logits + logits = apply_count_frequency_penalty( + logits, spec_metadata.device_penalty_token_counts, + spec_metadata.device_penalty_count_seq_slots[:batch_size], + spec_metadata.device_count_seq_frequency_penalties[:batch_size]) + return self._apply_draft_step_prefix_frequency_penalty( + logits, spec_metadata, batch_size, draft_prefix_tokens) + if (spec_metadata.device_penalty_sparse_token_ids is None + or spec_metadata.device_penalty_sparse_token_counts is None + or spec_metadata.device_penalty_sparse_count_lens is None): + return logits + logits = apply_sparse_count_frequency_penalty( + logits, spec_metadata.device_penalty_sparse_token_ids, + spec_metadata.device_penalty_sparse_token_counts, + spec_metadata.device_penalty_sparse_count_lens, + spec_metadata.device_penalty_count_seq_slots[:batch_size], + spec_metadata.device_count_seq_frequency_penalties[:batch_size]) + return self._apply_draft_step_prefix_frequency_penalty( + logits, spec_metadata, batch_size, draft_prefix_tokens) + if (spec_metadata.recent_seq_penalty_token_ids is not None + and spec_metadata.recent_seq_penalty_values is not None): + from .one_model_sampler import apply_recent_token_penalties + logits = apply_recent_token_penalties( + logits, + spec_metadata.recent_seq_penalty_token_ids[:batch_size], + spec_metadata.recent_seq_penalty_values[:batch_size]) + return self._apply_draft_step_prefix_frequency_penalty( + logits, spec_metadata, batch_size, draft_prefix_tokens) + if (not spec_metadata.use_device_penalty_history + or logits.dtype != torch.float32 + or spec_metadata.device_penalty_history_tokens is None + or spec_metadata.device_penalty_history_lens is None + or spec_metadata.device_penalty_seq_slots is None + or spec_metadata.device_seq_frequency_penalties is None): + return logits + + from .one_model_sampler import apply_history_frequency_penalty + logits = apply_history_frequency_penalty( + logits, 
spec_metadata.device_penalty_history_tokens, + spec_metadata.device_penalty_history_lens, + spec_metadata.device_penalty_seq_slots[:batch_size], + spec_metadata.device_seq_frequency_penalties[:batch_size]) + return self._apply_draft_step_prefix_frequency_penalty( + logits, spec_metadata, batch_size, draft_prefix_tokens) + def _execute_guided_decoder_if_present(self, logits): """Execute guided decoder on target model logits if available.""" if self.guided_decoder is not None: self.guided_decoder.execute(logits) + def _debug_sampled_count_penalty( + self, logits: torch.Tensor, sampled_tokens: torch.Tensor, + spec_metadata: SpecMetadata, num_tokens: int) -> None: + if spec_metadata.is_cuda_graph: + return + threshold = int( + os.environ.get("TRTLLM_SPEC_SAMPLE_DEBUG_COUNT_THRESHOLD", "0")) + if threshold <= 0: + return + max_logs = max( + int(os.environ.get("TRTLLM_SPEC_SAMPLE_DEBUG_MAX_LOGS", "64")), 0) + if max_logs <= 0: + return + logs = int(getattr(self, "_sample_count_debug_logs", 0)) + if logs >= max_logs: + return + if (spec_metadata.device_penalty_count_mode != "sparse" + or spec_metadata.device_penalty_count_row_slots is None + or spec_metadata.device_count_frequency_penalties is None + or spec_metadata.device_penalty_sparse_token_ids is None + or spec_metadata.device_penalty_sparse_token_counts is None + or spec_metadata.device_penalty_sparse_count_lens is None): + return + + row_limit = min( + num_tokens, + int(os.environ.get("TRTLLM_SPEC_SAMPLE_DEBUG_ROW_LIMIT", "256"))) + if row_limit <= 0: + return + + row_slots = spec_metadata.device_penalty_count_row_slots[:row_limit].detach( + ).cpu().tolist() + frequency_penalties = spec_metadata.device_count_frequency_penalties[: + row_limit].detach( + ).cpu( + ).tolist( + ) + sampled = sampled_tokens[:row_limit].detach().cpu().tolist() + count_lens = spec_metadata.device_penalty_sparse_count_lens.detach( + ).cpu() + debug_rows = [] + + for row, (slot, token, penalty) in enumerate( + zip(row_slots, sampled, 
frequency_penalties)): + if logs + len(debug_rows) >= max_logs: + break + slot = int(slot) + token = int(token) + penalty = float(penalty) + if slot < 0 or penalty == 0.0 or slot >= count_lens.numel(): + continue + count_len = int(count_lens[slot].item()) + if count_len <= 0: + continue + token_ids = spec_metadata.device_penalty_sparse_token_ids[ + slot, :count_len].detach().cpu() + matches = (token_ids == token).nonzero(as_tuple=False) + if matches.numel() == 0: + continue + count_index = int(matches[0].item()) + count = int(spec_metadata.device_penalty_sparse_token_counts[ + slot, count_index].detach().cpu().item()) + if count < threshold: + continue + row_logits = logits[row].detach().float() + selected_logit = float(row_logits[token].item()) + max_logit = float(row_logits.max().item()) + debug_rows.append({ + "row": row, + "slot": slot, + "token": token, + "count": count, + "count_len": count_len, + "frequency_penalty": penalty, + "selected_logit": selected_logit, + "row_max_logit": max_logit, + "selected_minus_max": selected_logit - max_logit, + }) + + if not debug_rows: + return + setattr(self, "_sample_count_debug_logs", logs + len(debug_rows)) + logger.info("Spec sample count penalty debug rows=%s", debug_rows) + def _prepare_next_new_tokens(self, accepted_tokens, next_draft_tokens, batch_indices_cuda, batch_size, num_accepted_tokens): @@ -846,6 +2074,7 @@ def _sample_tokens_for_batch( spec_metadata: SpecMetadata, num_contexts: int, batch_size: int, + draft_tokens: Optional[torch.Tensor] = None, ) -> torch.Tensor: """ Sample tokens from logits using per-request sampling parameters. 
@@ -861,7 +2090,11 @@ def _sample_tokens_for_batch( sampled_tokens: [num_tokens] - Sampled token ids """ if spec_metadata.allow_advanced_sampling: - from .one_model_sampler import sampling_batch_spec_dec_one_model + from .one_model_sampler import (apply_count_frequency_penalty, + apply_history_frequency_penalty, + apply_recent_token_penalties, + apply_sparse_count_frequency_penalty, + sampling_batch_spec_dec_one_model) num_gens = batch_size - num_contexts num_tokens = num_contexts + num_gens * ( @@ -870,6 +2103,56 @@ def _sample_tokens_for_batch( temperatures = spec_metadata.temperatures[:num_tokens] top_ks = spec_metadata.top_ks[:num_tokens] top_ps = spec_metadata.top_ps[:num_tokens] + if (spec_metadata.use_device_penalty_counts + and spec_metadata.device_penalty_count_row_slots is not None + and spec_metadata.device_count_frequency_penalties is not None): + spec_metadata.ensure_device_penalty_count_buffers( + int(logits.shape[-1])) + if (spec_metadata.device_penalty_count_mode == "dense" + and spec_metadata.device_penalty_token_counts is not None): + logits = apply_count_frequency_penalty( + logits, + spec_metadata.device_penalty_token_counts, + spec_metadata.device_penalty_count_row_slots[:num_tokens], + spec_metadata.device_count_frequency_penalties[: + num_tokens]) + elif (spec_metadata.device_penalty_count_mode == "sparse" + and spec_metadata.device_penalty_sparse_token_ids is not None + and spec_metadata.device_penalty_sparse_token_counts is not None + and spec_metadata.device_penalty_sparse_count_lens is not None): + logits = apply_sparse_count_frequency_penalty( + logits, + spec_metadata.device_penalty_sparse_token_ids, + spec_metadata.device_penalty_sparse_token_counts, + spec_metadata.device_penalty_sparse_count_lens, + spec_metadata.device_penalty_count_row_slots[:num_tokens], + spec_metadata.device_count_frequency_penalties[: + num_tokens]) + elif (spec_metadata.use_device_penalty_history + and logits.dtype == torch.float32 + and 
spec_metadata.device_penalty_history_tokens is not None + and spec_metadata.device_penalty_history_lens is not None + and spec_metadata.device_penalty_row_slots is not None + and spec_metadata.device_frequency_penalties is not None): + logits = apply_history_frequency_penalty( + logits, + spec_metadata.device_penalty_history_tokens, + spec_metadata.device_penalty_history_lens, + spec_metadata.device_penalty_row_slots[:num_tokens], + spec_metadata.device_frequency_penalties[:num_tokens]) + else: + recent_penalty_token_ids = ( + None if spec_metadata.recent_penalty_token_ids is None else + spec_metadata.recent_penalty_token_ids[:num_tokens]) + recent_penalty_values = ( + None if spec_metadata.recent_penalty_values is None else + spec_metadata.recent_penalty_values[:num_tokens]) + if recent_penalty_token_ids is not None and recent_penalty_values is not None: + logits = apply_recent_token_penalties( + logits, recent_penalty_token_ids, recent_penalty_values) + + logits = self._apply_target_draft_prefix_frequency_penalty( + logits, spec_metadata, num_contexts, batch_size, draft_tokens) if self.use_flashinfer: top_ks = top_ks.clamp(min=1, max=logits.shape[-1] - 1) @@ -892,6 +2175,8 @@ def _sample_tokens_for_batch( use_flashinfer=self.use_flashinfer, seed=self.seed, offset=self.offset) + self._debug_sampled_count_penalty(logits, sampled_tokens, + spec_metadata, num_tokens) else: sampled_tokens = torch.argmax(logits, dim=-1) diff --git a/tensorrt_llm/_torch/speculative/one_model_sampler.py b/tensorrt_llm/_torch/speculative/one_model_sampler.py index 7d49aa85dd1..e337ee84b54 100644 --- a/tensorrt_llm/_torch/speculative/one_model_sampler.py +++ b/tensorrt_llm/_torch/speculative/one_model_sampler.py @@ -1,3 +1,4 @@ +import os from typing import Optional import torch @@ -73,6 +74,106 @@ def apply_temperature( return logits.div_(temp.unsqueeze(dim=1)) +def apply_recent_token_penalties( + logits: torch.Tensor, + token_ids: torch.Tensor, + penalty_values: torch.Tensor, +) -> 
torch.Tensor: + if os.environ.get("TRTLLM_SPEC_USE_PENALTY_OP", "0") == "1": + torch.ops.trtllm.speculative_apply_token_penalties( + logits, token_ids, penalty_values) + return logits + return logits.scatter_add_(1, token_ids, -penalty_values.to(logits.dtype)) + + +def apply_history_frequency_penalty( + logits: torch.Tensor, + history_tokens: torch.Tensor, + history_lens: torch.Tensor, + row_slots: torch.Tensor, + frequency_penalties: torch.Tensor, +) -> torch.Tensor: + torch.ops.trtllm.speculative_apply_history_frequency_penalty( + logits, history_tokens, history_lens, row_slots, frequency_penalties) + return logits + + +def apply_count_frequency_penalty( + logits: torch.Tensor, + token_counts: torch.Tensor, + row_slots: torch.Tensor, + frequency_penalties: torch.Tensor, +) -> torch.Tensor: + torch.ops.trtllm.speculative_apply_count_frequency_penalty( + logits, token_counts, row_slots, frequency_penalties) + return logits + + +def apply_sparse_count_frequency_penalty( + logits: torch.Tensor, + token_ids: torch.Tensor, + token_counts: torch.Tensor, + count_lens: torch.Tensor, + row_slots: torch.Tensor, + frequency_penalties: torch.Tensor, +) -> torch.Tensor: + torch.ops.trtllm.speculative_apply_sparse_count_frequency_penalty( + logits, token_ids, token_counts, count_lens, row_slots, + frequency_penalties) + return logits + + +def append_accepted_tokens_to_history( + history_tokens: torch.Tensor, + history_lens: torch.Tensor, + seq_slots: torch.Tensor, + accepted_tokens: torch.Tensor, + accepted_lens: torch.Tensor, +) -> None: + torch.ops.trtllm.speculative_append_accepted_tokens( + history_tokens, history_lens, seq_slots, accepted_tokens, + accepted_lens) + + +def append_accepted_tokens_to_counts( + token_counts: torch.Tensor, + seq_slots: torch.Tensor, + accepted_tokens: torch.Tensor, + accepted_lens: torch.Tensor, +) -> None: + torch.ops.trtllm.speculative_append_accepted_token_counts( + token_counts, seq_slots, accepted_tokens, accepted_lens) + + +def 
append_accepted_tokens_to_sparse_counts( + token_ids: torch.Tensor, + token_counts: torch.Tensor, + count_lens: torch.Tensor, + seq_slots: torch.Tensor, + accepted_tokens: torch.Tensor, + accepted_lens: torch.Tensor, + vocab_size: int, +) -> None: + torch.ops.trtllm.speculative_append_sparse_token_counts( + token_ids, token_counts, count_lens, seq_slots, accepted_tokens, + accepted_lens, vocab_size) + + +def init_sparse_token_counts( + token_ids: torch.Tensor, + token_counts: torch.Tensor, + count_lens: torch.Tensor, + prompt_token_ids: torch.Tensor, + prompt_token_counts: torch.Tensor, + prompt_lens: torch.Tensor, + seq_slots: torch.Tensor, + vocab_size: int, +) -> None: + torch.ops.trtllm.speculative_init_sparse_token_counts( + token_ids, token_counts, count_lens, prompt_token_ids, + prompt_token_counts, prompt_lens, seq_slots, vocab_size) + + @torch.compile(options={"max-autotune": True}) def sampling_batch_spec_dec_one_model( logits: torch.Tensor, diff --git a/tensorrt_llm/_torch/speculative/spec_sampler_base.py b/tensorrt_llm/_torch/speculative/spec_sampler_base.py index a795a19d6d8..b6550a428c5 100644 --- a/tensorrt_llm/_torch/speculative/spec_sampler_base.py +++ b/tensorrt_llm/_torch/speculative/spec_sampler_base.py @@ -19,11 +19,13 @@ Eagle3OneModelSampler. 
""" +import os from dataclasses import dataclass from typing import Optional import torch +from tensorrt_llm.bindings.executor import FinishReason from tensorrt_llm.logger import logger from ..pyexecutor.llm_request import LlmRequest, LlmRequestState @@ -99,10 +101,27 @@ def __init__(self, args: TorchSampler.Args, *, draft_len: int): self.max_seq_len = args.max_seq_len seq_slots = args.max_num_sequences + self.max_num_sequences = seq_slots max_tokens = self._get_max_tokens(args, draft_len) draft_tokens_size = self._get_draft_tokens_storage_size(args, draft_len) self.max_beam_width = args.max_beam_width assert self.max_beam_width == 1, "beam width must be 1 for speculative decoding" + self.loop_guard_tokens = int( + os.environ.get("TRTLLM_SPEC_LOOP_GUARD_TOKENS", "0")) + self.loop_guard_period = max( + int(os.environ.get("TRTLLM_SPEC_LOOP_GUARD_PERIOD", "64")), 1) + self.loop_guard_repeats = max( + int(os.environ.get("TRTLLM_SPEC_LOOP_GUARD_REPEATS", "4")), 2) + self.loop_guard_window = max( + int(os.environ.get("TRTLLM_SPEC_LOOP_GUARD_WINDOW", "1024")), 128) + self.count_debug_tokens = int( + os.environ.get("TRTLLM_SPEC_COUNT_DEBUG_TOKENS", "0")) + self.count_debug_period = max( + int(os.environ.get("TRTLLM_SPEC_COUNT_DEBUG_PERIOD", "256")), 1) + self.count_debug_max_rows = max( + int(os.environ.get("TRTLLM_SPEC_COUNT_DEBUG_MAX_ROWS", "4")), 1) + self.count_debug_path_logged = False + self.count_debug_order_logged = False self.store = self.Store( new_tokens=int_tensor((max_tokens, seq_slots, self.max_beam_width)), @@ -137,6 +156,11 @@ def _add_dummy_draft_tokens(self) -> bool: """ return True + def _is_valid_slot(self, request: LlmRequest) -> bool: + slot = request.py_seq_slot + return (slot is not None and 0 <= int(slot) < self.max_num_sequences + and not request.is_dummy) + def _request_common_handling( self, request: LlmRequest, @@ -161,9 +185,302 @@ def _request_common_handling( "return_log_probs not supported with speculative decoding, skipping for request 
%s", request.py_request_id, ) - request.py_draft_tokens = next_draft_tokens[request.py_seq_slot][:runtime_draft_len] + request.py_draft_tokens = next_draft_tokens[ + request.py_seq_slot][:runtime_draft_len] request.py_decoding_iter += 1 + def _ordered_sampling_requests_from_outputs( + self, + fallback_requests: list[LlmRequest], + outputs: dict[str, torch.Tensor], + num_skip: int) -> list[LlmRequest]: + request_ids = outputs.get("penalty_sampling_request_ids") + if request_ids is None: + return fallback_requests + + output_request_ids = [int(request_id) for request_id in request_ids] + output_seq_slots = outputs.get("penalty_sampling_seq_slots") + if output_seq_slots is not None: + output_seq_slots = [int(slot) for slot in output_seq_slots] + + row_request_ids = output_request_ids[ + num_skip:num_skip + len(fallback_requests)] + row_seq_slots = None + if output_seq_slots is not None: + row_seq_slots = output_seq_slots[ + num_skip:num_skip + len(fallback_requests)] + if len(row_request_ids) != len(fallback_requests): + return fallback_requests + + available = list(fallback_requests) + ordered_requests: list[LlmRequest] = [] + for row, request_id in enumerate(row_request_ids): + row_slot = row_seq_slots[row] if row_seq_slots is not None else None + match_idx = None + for idx, request in enumerate(available): + if int(request.py_request_id) != request_id: + continue + if row_slot is not None: + slot = request.py_seq_slot + request_slot = int(slot) if slot is not None else -1 + if request_slot != row_slot: + continue + match_idx = idx + break + if match_idx is None and row_slot is not None: + for idx, request in enumerate(available): + if int(request.py_request_id) == request_id: + match_idx = idx + break + if match_idx is None: + return fallback_requests + ordered_requests.append(available.pop(match_idx)) + + if (self.count_debug_tokens > 0 and not self.count_debug_order_logged): + fallback_head = [ + (int(request.py_request_id), + int(request.py_seq_slot) if 
request.py_seq_slot is not None else -1) + for request in fallback_requests[:16] + ] + ordered_head = [ + (int(request.py_request_id), + int(request.py_seq_slot) if request.py_seq_slot is not None else -1) + for request in ordered_requests[:16] + ] + if ordered_head != fallback_head: + self.count_debug_order_logged = True + logger.info( + "Spec sampler remapped output row request order " + "fallback_head=%s output_head=%s", + fallback_head, ordered_head) + + return ordered_requests + + @staticmethod + def _prompt_len(request: LlmRequest) -> int: + for attr in ("py_orig_prompt_len", "orig_prompt_len", "py_prompt_len", + "prompt_len"): + value = getattr(request, attr, None) + if value is not None: + return max(int(value), 0) + return 0 + + @staticmethod + def _has_repeated_suffix(tokens: list[int], repeats: int) -> bool: + if len(tokens) >= 256 and len(set(tokens[-256:])) <= 8: + return True + for ngram in (16, 8, 4): + needed = ngram * repeats + if len(tokens) < needed: + continue + suffix = tokens[-ngram:] + repeated = True + for i in range(2, repeats + 1): + start = -ngram * i + end = start + ngram + if tokens[start:end] != suffix: + repeated = False + break + if repeated: + return True + return False + + def _has_repeated_window(self, tokens: list[int]) -> bool: + window = tokens[-self.loop_guard_window:] + if len(window) < 512: + return False + if len(set(window[-512:])) <= 32: + return True + for ngram, min_count in ((16, 3), (12, 4), (8, 8)): + if len(window) < ngram * min_count: + continue + counts: dict[tuple[int, ...], int] = {} + for start in range(0, len(window) - ngram + 1): + key = tuple(window[start:start + ngram]) + count = counts.get(key, 0) + 1 + if count >= min_count: + return True + counts[key] = count + return False + + def _maybe_finish_repetition_loop(self, request: LlmRequest, beam_idx: int, + prev_generated_len: int, + generated_len: int) -> bool: + if self.loop_guard_tokens <= 0: + return False + if generated_len < self.loop_guard_tokens: + 
return False + if (prev_generated_len >= self.loop_guard_tokens + and (generated_len // self.loop_guard_period + == prev_generated_len // self.loop_guard_period)): + return False + tokens = request.get_tokens(beam_idx) + generated_tokens = tokens[-generated_len:] + if (not self._has_repeated_suffix(generated_tokens, + self.loop_guard_repeats) + and not self._has_repeated_window(generated_tokens)): + return False + + logger.warning( + "Speculative decoding repetition guard stopped request_id=%s " + "generated_len=%s", + request.py_request_id, generated_len) + request.finish_by(FinishReason.STOP_WORDS, beam_idx) + return True + + def _should_debug_count_row(self, request: LlmRequest, + accepted_len: int) -> tuple[bool, int, int]: + if self.count_debug_tokens <= 0 or accepted_len <= 0: + return False, 0, 0 + prompt_len = self._prompt_len(request) + prev_generated_len = max(len(request.get_tokens(0)) - prompt_len, 0) + next_generated_len = prev_generated_len + accepted_len + if next_generated_len < self.count_debug_tokens: + return False, prev_generated_len, next_generated_len + prev_bucket = max(prev_generated_len - self.count_debug_tokens, + 0) // self.count_debug_period + next_bucket = max(next_generated_len - self.count_debug_tokens, + 0) // self.count_debug_period + return (prev_generated_len < self.count_debug_tokens + or next_bucket != prev_bucket), prev_generated_len, next_generated_len + + def _maybe_log_count_debug_path(self, outputs: dict[str, torch.Tensor]) -> None: + if self.count_debug_tokens <= 0 or self.count_debug_path_logged: + return + self.count_debug_path_logged = True + logger.info( + "Spec count debug path env_mode=%s dense=%s sparse=%s " + "history=%s history_appended=%s", + os.environ.get("TRTLLM_SPEC_COUNT_MODE", ""), + outputs.get("penalty_token_counts") is not None, + (outputs.get("penalty_sparse_token_ids") is not None + and outputs.get("penalty_sparse_token_counts") is not None + and outputs.get("penalty_sparse_count_lens") is not None), + 
(outputs.get("penalty_history_tokens") is not None + and outputs.get("penalty_history_lens") is not None), + outputs.get("penalty_history_appended", False)) + + def _debug_dense_counts_before_append( + self, + sampling_requests: list[LlmRequest], + penalty_slot_values: list[int], + o_new_tokens: torch.Tensor, + o_new_tokens_lens: torch.Tensor, + token_counts: torch.Tensor, + ) -> None: + if self.count_debug_tokens <= 0: + return + + accepted_lens = o_new_tokens_lens.detach().cpu().tolist() + rows_logged = 0 + for row, request in enumerate(sampling_requests): + if rows_logged >= self.count_debug_max_rows: + return + if row >= len(penalty_slot_values): + continue + slot = penalty_slot_values[row] + if slot < 0: + continue + accepted_len = int(accepted_lens[row]) + should_log, prev_generated_len, next_generated_len = ( + self._should_debug_count_row(request, accepted_len)) + if not should_log: + continue + + accepted_tokens = o_new_tokens[row, :accepted_len].detach().cpu().tolist() + history_tail = [ + int(token) for token in request.get_tokens(0)[-16:] + if int(token) >= 0 + ] + interesting_tokens = list(dict.fromkeys( + [int(token) for token in accepted_tokens] + history_tail)) + if interesting_tokens: + interesting_tensor = torch.tensor( + interesting_tokens, + dtype=torch.long, + device=token_counts.device) + interesting_counts = token_counts[slot].index_select( + 0, interesting_tensor).detach().cpu().tolist() + count_by_token = { + token: int(count) + for token, count in zip(interesting_tokens, + interesting_counts) + } + else: + count_by_token = {} + accepted_counts = [(int(token), count_by_token.get(int(token), 0)) + for token in accepted_tokens] + history_tail_counts = [(token, count_by_token.get(token, 0)) + for token in history_tail] + + logger.info( + "Spec dense count debug before append request_id=%s slot=%s " + "prev_generated_len=%s next_generated_len=%s accepted_len=%s " + "accepted_counts=%s history_tail_counts=%s", + request.py_request_id, slot, 
prev_generated_len, + next_generated_len, accepted_len, accepted_counts, + history_tail_counts) + rows_logged += 1 + + def _debug_sparse_counts_before_append( + self, + sampling_requests: list[LlmRequest], + penalty_slot_values: list[int], + o_new_tokens: torch.Tensor, + o_new_tokens_lens: torch.Tensor, + token_ids: torch.Tensor, + token_counts: torch.Tensor, + count_lens: torch.Tensor, + count_vocab_size: int, + ) -> None: + if self.count_debug_tokens <= 0: + return + + accepted_lens = o_new_tokens_lens.detach().cpu().tolist() + rows_logged = 0 + for row, request in enumerate(sampling_requests): + if rows_logged >= self.count_debug_max_rows: + return + if row >= len(penalty_slot_values): + continue + slot = penalty_slot_values[row] + if slot < 0: + continue + accepted_len = int(accepted_lens[row]) + should_log, prev_generated_len, next_generated_len = ( + self._should_debug_count_row(request, accepted_len)) + if not should_log: + continue + + count_len = int(count_lens[slot].detach().cpu().item()) + count_len = min(max(count_len, 0), int(token_ids.shape[1])) + sparse_ids = token_ids[slot, :count_len].detach().cpu().tolist() + sparse_counts = token_counts[slot, :count_len].detach().cpu().tolist() + count_by_token = { + int(token): int(count) + for token, count in zip(sparse_ids, sparse_counts) + } + + accepted_tokens = o_new_tokens[row, :accepted_len].detach().cpu().tolist() + accepted_counts = [(int(token), count_by_token.get(int(token), 0)) + for token in accepted_tokens] + history_tail = [ + int(token) for token in request.get_tokens(0)[-16:] + if int(token) >= 0 + ] + history_tail_counts = [(token, count_by_token.get(token, 0)) + for token in history_tail] + + logger.info( + "Spec sparse count debug before append request_id=%s slot=%s " + "prev_generated_len=%s next_generated_len=%s accepted_len=%s " + "count_len=%s count_vocab_size=%s accepted_counts=%s " + "history_tail_counts=%s", + request.py_request_id, slot, prev_generated_len, + next_generated_len, 
accepted_len, count_len, count_vocab_size, + accepted_counts, history_tail_counts) + rows_logged += 1 + def update_requests( self, state: SampleStateSpec, @@ -196,6 +513,15 @@ def update_requests( req, new_token, max_seq_len=self.max_seq_len, beam_idx=beam_idx ): break + if (self.loop_guard_tokens > 0 + and req.state != LlmRequestState.GENERATION_COMPLETE): + guard_len_attr = "_spec_loop_guard_generated_len" + prev_generated_len = getattr(req, guard_len_attr, 0) + generated_len = prev_generated_len + num_new_tokens + setattr(req, guard_len_attr, generated_len) + self._maybe_finish_repetition_loop(req, beam_idx, + prev_generated_len, + generated_len) req.py_num_accepted_draft_tokens = num_new_tokens - 1 req.py_rewind_len = runtime_draft_len - req.py_num_accepted_draft_tokens self._request_common_handling(req, next_draft_tokens_list, runtime_draft_len) @@ -224,11 +550,53 @@ def sample_async( """ num_skip = len(scheduled_requests.context_requests_chunking) finished_context_requests = scheduled_requests.context_requests_last_chunk - sampling_requests = finished_context_requests + scheduled_requests.generation_requests + fallback_sampling_requests = ( + finished_context_requests + scheduled_requests.generation_requests) + sampling_requests = self._ordered_sampling_requests_from_outputs( + fallback_sampling_requests, outputs, num_skip) num_sampling_requests = len(sampling_requests) - slots = torch.as_tensor([r.py_seq_slot for r in sampling_requests], dtype=torch.long) + valid_positions: list[int] = [] + valid_slot_values: list[int] = [] + penalty_slot_values: list[int] = [] + valid_sampling_requests: list[LlmRequest] = [] + for pos, request in enumerate(sampling_requests): + if self._is_valid_slot(request): + slot = int(request.py_seq_slot) + valid_positions.append(pos) + valid_slot_values.append(slot) + penalty_slot_values.append(slot) + valid_sampling_requests.append(request) + else: + penalty_slot_values.append(-1) + + slots = torch.as_tensor(valid_slot_values, 
dtype=torch.long) slots = slots.to(device="cuda", non_blocking=True) + penalty_slots: Optional[torch.Tensor] = None + + def fallback_penalty_slots() -> torch.Tensor: + nonlocal penalty_slots + if penalty_slots is None: + penalty_slots = torch.as_tensor(penalty_slot_values, + dtype=torch.int32) + penalty_slots = penalty_slots.to(device="cuda", + non_blocking=True) + return penalty_slots + + def output_slot_slice(name: str) -> Optional[torch.Tensor]: + seq_slots = outputs.get(name) + if seq_slots is None: + return None + end = num_skip + num_sampling_requests + if seq_slots.numel() < end: + return None + return seq_slots[num_skip:end].contiguous() + + count_penalty_slots = output_slot_slice("penalty_count_seq_slots") + history_penalty_slots = output_slot_slice("penalty_history_seq_slots") + valid_positions_cuda = torch.as_tensor(valid_positions, + dtype=torch.long, + device="cuda") o_new_tokens = outputs["new_tokens"][num_skip : num_skip + num_sampling_requests] o_new_tokens_lens = outputs["new_tokens_lens"][num_skip : num_skip + num_sampling_requests] @@ -238,6 +606,59 @@ def sample_async( o_next_new_tokens = outputs["next_new_tokens"][num_skip : num_skip + num_sampling_requests] runtime_draft_len = o_next_draft_tokens.shape[1] + self._maybe_log_count_debug_path(outputs) + + penalty_token_counts = outputs.get("penalty_token_counts") + if penalty_token_counts is not None: + from .one_model_sampler import append_accepted_tokens_to_counts + self._debug_dense_counts_before_append( + sampling_requests, penalty_slot_values, o_new_tokens, + o_new_tokens_lens, penalty_token_counts) + append_accepted_tokens_to_counts( + penalty_token_counts, + count_penalty_slots + if count_penalty_slots is not None else fallback_penalty_slots(), + o_new_tokens.contiguous(), + o_new_tokens_lens.contiguous()) + elif (outputs.get("penalty_sparse_token_ids") is not None + and outputs.get("penalty_sparse_token_counts") is not None + and outputs.get("penalty_sparse_count_lens") is not None): + 
from .one_model_sampler import append_accepted_tokens_to_sparse_counts + self._debug_sparse_counts_before_append( + sampling_requests, penalty_slot_values, o_new_tokens, + o_new_tokens_lens, outputs["penalty_sparse_token_ids"], + outputs["penalty_sparse_token_counts"], + outputs["penalty_sparse_count_lens"], + int(outputs.get("penalty_count_vocab_size", 0))) + append_accepted_tokens_to_sparse_counts( + outputs["penalty_sparse_token_ids"], + outputs["penalty_sparse_token_counts"], + outputs["penalty_sparse_count_lens"], + count_penalty_slots + if count_penalty_slots is not None else fallback_penalty_slots(), + o_new_tokens.contiguous(), + o_new_tokens_lens.contiguous(), + int(outputs.get("penalty_count_vocab_size", 0))) + + penalty_history_tokens = outputs.get("penalty_history_tokens") + penalty_history_lens = outputs.get("penalty_history_lens") + if (penalty_history_tokens is not None + and penalty_history_lens is not None + and not outputs.get("penalty_history_appended", False)): + from .one_model_sampler import append_accepted_tokens_to_history + append_accepted_tokens_to_history( + penalty_history_tokens, + penalty_history_lens, + history_penalty_slots + if history_penalty_slots is not None else fallback_penalty_slots(), + o_new_tokens.contiguous(), + o_new_tokens_lens.contiguous()) + + o_new_tokens = o_new_tokens.index_select(0, valid_positions_cuda) + o_new_tokens_lens = o_new_tokens_lens.index_select(0, valid_positions_cuda) + o_next_draft_tokens = o_next_draft_tokens.index_select(0, valid_positions_cuda) + o_next_new_tokens = o_next_new_tokens.index_select(0, valid_positions_cuda) + # Pad to match fixed-size store buffers for index_copy_. 
if o_new_tokens.shape[1] < (self.draft_len + 1): o_new_tokens = torch.nn.functional.pad( @@ -275,10 +696,11 @@ def sample_async( # Add dummy draft tokens to context requests for KV cache preparation if self._add_dummy_draft_tokens(): for request in finished_context_requests: - request.py_draft_tokens = [1] * self.draft_len + if self._is_valid_slot(request): + request.py_draft_tokens = [1] * self.draft_len return SampleStateSpec( - requests=sampling_requests, + requests=valid_sampling_requests, device=device_tensors, host=host_tensors, sampler_event=sampler_event, diff --git a/tensorrt_llm/_torch/speculative/utils.py b/tensorrt_llm/_torch/speculative/utils.py index 33f37706399..0e14ace483b 100644 --- a/tensorrt_llm/_torch/speculative/utils.py +++ b/tensorrt_llm/_torch/speculative/utils.py @@ -34,7 +34,8 @@ def get_spec_metadata(spec_config, max_num_tokens, spec_resource_manager=None, is_draft_model=False, - max_seq_len=262144): + max_seq_len=262144, + max_num_sequence_slots: Optional[int] = None): if spec_config.spec_dec_mode.is_mtp_one_model(): return MTPSpecMetadata( max_draft_len=spec_config.max_draft_len, @@ -42,6 +43,7 @@ def get_spec_metadata(spec_config, spec_dec_mode=spec_config.spec_dec_mode, mtp_num_modules=spec_config.num_nextn_predict_layers, max_num_requests=max_num_requests, + max_num_sequence_slots=max_num_sequence_slots, mtp_hidden_states_manager=spec_resource_manager, allow_advanced_sampling=spec_config.allow_advanced_sampling, ) @@ -51,6 +53,7 @@ def get_spec_metadata(spec_config, max_total_draft_tokens=spec_config.tokens_per_gen_step - 1, spec_dec_mode=spec_config.spec_dec_mode, max_num_requests=max_num_requests, + max_num_sequence_slots=max_num_sequence_slots, num_layers=model_config.num_hidden_layers, hidden_size=model_config.hidden_size, max_num_tokens=max_num_tokens, @@ -66,6 +69,7 @@ def get_spec_metadata(spec_config, max_total_draft_tokens=spec_config.tokens_per_gen_step - 1, spec_dec_mode=spec_config.spec_dec_mode, 
max_num_requests=max_num_requests, + max_num_sequence_slots=max_num_sequence_slots, num_layers=model_config.num_hidden_layers, hidden_size=model_config.hidden_size, max_num_tokens=max_num_tokens, @@ -85,6 +89,7 @@ def get_spec_metadata(spec_config, max_total_draft_tokens=spec_config.tokens_per_gen_step - 1, spec_dec_mode=spec_config.spec_dec_mode, max_num_requests=max_num_requests, + max_num_sequence_slots=max_num_sequence_slots, num_layers=model_config.num_hidden_layers, hidden_size=model_config.hidden_size, max_num_tokens=max_num_tokens, @@ -98,6 +103,7 @@ def get_spec_metadata(spec_config, max_total_draft_tokens=spec_config.tokens_per_gen_step - 1, spec_dec_mode=spec_config.spec_dec_mode, max_num_requests=max_num_requests, + max_num_sequence_slots=max_num_sequence_slots, allow_advanced_sampling=spec_config.allow_advanced_sampling, spec_resource_manager=spec_resource_manager, ) @@ -107,6 +113,7 @@ def get_spec_metadata(spec_config, max_total_draft_tokens=spec_config.max_total_draft_tokens, spec_dec_mode=spec_config.spec_dec_mode, max_num_requests=max_num_requests, + max_num_sequence_slots=max_num_sequence_slots, max_num_tokens=max_num_tokens, allow_advanced_sampling=spec_config.allow_advanced_sampling, ) @@ -116,6 +123,7 @@ def get_spec_metadata(spec_config, max_total_draft_tokens=1, spec_dec_mode=spec_config.spec_dec_mode, max_num_requests=max_num_requests, + max_num_sequence_slots=max_num_sequence_slots, num_model_layers=model_config.num_hidden_layers, hidden_size=model_config.hidden_size, max_num_tokens=max_num_tokens, @@ -129,6 +137,7 @@ def get_spec_metadata(spec_config, max_total_draft_tokens=spec_config.max_total_draft_tokens, spec_dec_mode=spec_config.spec_dec_mode, max_num_requests=max_num_requests, + max_num_sequence_slots=max_num_sequence_slots, sa_manager=spec_resource_manager, max_matching_ngram_size=spec_config.max_matching_ngram_size, ) @@ -140,6 +149,7 @@ def get_spec_metadata(spec_config, max_total_draft_tokens=spec_config.tokens_per_gen_step - 
1, spec_dec_mode=spec_config.spec_dec_mode, max_num_requests=max_num_requests, + max_num_sequence_slots=max_num_sequence_slots, ) return None