A

yizhang-nv · yizhang-nv · commit 44b6a1ade1e9 · 2026-02-23T19:53:07.000-08:00
Signed-off-by: yizhang-nv <187001205+yizhang-nv@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp
@@ -322,7 +322,7 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(nb::module_& m)
         .def_rw("missed_blocks", &tbk::KvCacheStats::missedBlocks)
         .def_rw("cache_hit_rate", &tbk::KvCacheStats::cacheHitRate)
         .def_rw("num_free_blocks_per_window_size", &tbk::KvCacheStats::numFreeBlocksPerWindowSize)
-        .def_ro("allocated_bytes", &tbk::KvCacheStats::allocatedBytes);
+        .def_rw("allocated_bytes", &tbk::KvCacheStats::allocatedBytes);
 
     nb::class_<tbk::TempAttentionWindowInputs>(m, "TempAttentionWindowInputs")
         .def(nb::init<>())
diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -16,6 +16,7 @@
 from tensorrt_llm._utils import (TensorWrapper, convert_to_torch_tensor,
                                  get_size_in_bytes, mpi_comm, mpi_disabled,
                                  torch_comm)
+from tensorrt_llm.bindings.internal.batch_manager import KvCacheStats
 from tensorrt_llm.bindings.internal.batch_manager.kv_cache_manager_v2_utils import (
     IndexMapper, copy_batch_block_offsets_to_device)
 from tensorrt_llm.bindings.internal.runtime import TaskLayerModuleConfig
@@ -1961,13 +1962,14 @@ def _kv_connector_should_add_sequence(self, request: LlmRequest) -> bool:
             request)
 
     def get_kv_cache_stats(self):
+        # TODO: Remove this once we have a proper way to shutdown the kv cache manager
+        if hasattr(self, "kv_cache_stats"):
+            return self.kv_cache_stats
 
-        class KVCacheStatus:
+        kv_cache_stats = KvCacheStats()
+        kv_cache_stats.allocated_bytes = self.impl.get_quota(GPU_LEVEL)
 
-            def __init__(self, allocated_bytes: int):
-                self.allocated_bytes = allocated_bytes
-
-        return KVCacheStatus(allocated_bytes=self.impl.get_quota(GPU_LEVEL))
+        return kv_cache_stats
 
     def get_block_ids_per_seq(self, request_ids: List[int]) -> torch.Tensor:
         block_ids_per_seq = self.get_batch_cache_indices(request_ids)
@@ -2208,7 +2210,11 @@ def shutdown(self):
         for kv_cache in self.kv_cache_map.values():
             kv_cache.close()
         self.kv_cache_map.clear()
-        self.impl.clear_reusable_blocks()
+        self.kv_cache_stats = self.get_kv_cache_stats()
+        if hasattr(self, "impl"):
+            # TODO: Use self.impl.shutdown() instead of del self.impl
+            self.impl.clear_reusable_blocks()
+            del self.impl
 
     def get_max_resource_count(self) -> int:
         # TODO: implement this
@@ -2279,7 +2285,8 @@ def update_resources(self,
                         req.get_tokens(DEFAULT_BEAM_INDEX)
                         [kv_cache.num_committed_tokens:req.
                          context_current_position])
-                kv_cache.stop_committing()
+                if req.context_remaining_length == 0:
+                    kv_cache.stop_committing()
             else:
                 success = kv_cache.resize(None, req.context_current_position)
                 if not success:
diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py
@@ -222,10 +222,15 @@ def get_test_config(test_desc, example_dir, test_root):
 def get_extra_llm_config(config, suffix, cwd):
     extra_llm_config = {
         'orchestrator_type': 'ray',
+        'kv_cache_config': {
+            'use_kv_cache_manager_v2': False
+        }
     }
     for key, value in config.items():
         if key not in ['num_instances', 'urls']:
             extra_llm_config[key] = value
+            if key == 'kv_cache_config':
+                extra_llm_config[key]['use_kv_cache_manager_v2'] = False
 
     temp_fd, extra_config_file = tempfile.mkstemp(suffix='_%s.yaml' % suffix,
                                                   dir=cwd)
diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
@@ -172,7 +172,10 @@ def verify_disaggregated(model, generation_overlap, enable_cuda_graph, prompt,
             disable_overlap_scheduler=not generation_overlap,
             cuda_graph_config=CudaGraphConfig() if enable_cuda_graph else None))
 
-    kv_cache_configs = [KvCacheConfig(max_tokens=2048 * 8) for _ in range(2)]
+    kv_cache_configs = [
+        KvCacheConfig(max_tokens=2048 * 8, use_kv_cache_manager_v2=False)
+        for _ in range(2)
+    ]
     cache_transceiver_configs = [
         CacheTransceiverConfig(backend="DEFAULT") for _ in range(2)
     ]
@@ -318,8 +321,10 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,
             cuda_graph_config=CudaGraphConfig() if enable_cuda_graph else None))
 
     kv_cache_configs = [
-        KvCacheConfig(max_tokens=128, enable_block_reuse=False, dtype="auto")
-        for _ in range(2)
+        KvCacheConfig(max_tokens=128,
+                      enable_block_reuse=False,
+                      dtype="auto",
+                      use_kv_cache_manager_v2=False) for _ in range(2)
     ]
     cache_transceiver_configs = [
         CacheTransceiverConfig(backend="DEFAULT") for _ in range(2)
@@ -429,7 +434,8 @@ def test_disaggregated_spec_dec_batch_slot_limit(model, spec_dec_model_path,
     kv_cache_configs = [
         KvCacheConfig(max_tokens=128,
                       enable_block_reuse=False,
-                      free_gpu_memory_fraction=0.4) for _ in range(2)
+                      free_gpu_memory_fraction=0.4,
+                      use_kv_cache_manager_v2=False) for _ in range(2)
     ]
     cache_transceiver_configs = [
         CacheTransceiverConfig(backend="DEFAULT") for _ in range(2)