Skip to content

Commit 90d46dc

Browse files
committed
feat(core): add verbose debug logging to longest_token_prefix fast paths
- Added an optional `verbose` parameter to `Llama.longest_token_prefix` to explicitly log early-exit conditions. This provides crucial visibility into cache-miss behaviors during debugging by outputting the specific reason for a fast exit (e.g., empty sequence vs. mismatched first token) along with the offending sequence lengths or token values.
1 parent 35957bd commit 90d46dc

3 files changed

Lines changed: 27 additions & 11 deletions

File tree

llama_cpp/llama.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1210,7 +1210,7 @@ def generate(
12101210
# Check for kv cache prefix match
12111211
if reset and self.n_tokens > 0:
12121212
# 1. First, check for a 100% exact match of the entire sequence
1213-
full_match_prefix = self.longest_token_prefix(self._input_ids, tokens)
1213+
full_match_prefix = self.longest_token_prefix(self._input_ids, tokens, self.verbose)
12141214

12151215
# --- FAST PATH: Zero-latency bypass for Hybrid Single-Turn & Multimodal ---
12161216
# If the cache is disabled (max_checkpoints <= 0) and we have a 100% match,
@@ -1233,7 +1233,7 @@ def generate(
12331233
else:
12341234
# By matching against `tokens[:-1]`, we intentionally drop the last token.
12351235
# This forces the engine to re-evaluate the final token to refresh sampling logits.
1236-
longest_prefix = self.longest_token_prefix(self._input_ids, tokens[:-1])
1236+
longest_prefix = self.longest_token_prefix(self._input_ids, tokens[:-1], self.verbose)
12371237

12381238
if longest_prefix > 0:
12391239
reset = False
@@ -1840,10 +1840,10 @@ def _create_completion(
18401840
try:
18411841
cache_item = self.cache[prompt_tokens]
18421842
cache_prefix_len = Llama.longest_token_prefix(
1843-
cache_item.input_ids, prompt_tokens
1843+
cache_item.input_ids, prompt_tokens, self.verbose
18441844
)
18451845
eval_prefix_len = Llama.longest_token_prefix(
1846-
self._input_ids, prompt_tokens
1846+
self._input_ids, prompt_tokens, self.verbose
18471847
)
18481848
if cache_prefix_len > eval_prefix_len:
18491849
self.load_state(cache_item)
@@ -2996,7 +2996,8 @@ def logits_to_logprobs(
29962996
@staticmethod
29972997
def longest_token_prefix(
29982998
current_ids: Union[Sequence[int], npt.NDArray[np.intc]],
2999-
new_tokens: Union[Sequence[int], npt.NDArray[np.intc]]
2999+
new_tokens: Union[Sequence[int], npt.NDArray[np.intc]],
3000+
verbose: bool = False
30003001
) -> int:
30013002
"""
30023003
Calculates the length of the longest common prefix between two token sequences.
@@ -3008,12 +3009,19 @@ def longest_token_prefix(
30083009
Args:
30093010
current_ids: The existing token sequence (e.g., KV cache).
30103011
new_tokens: The new input token sequence.
3012+
verbose: If True, prints detailed debug information to stderr.
30113013
30123014
Returns:
30133015
int: The number of matching tokens from the start.
30143016
"""
30153017
# Fast exit for empty sequences to avoid unnecessary processing
30163018
if len(current_ids) == 0 or len(new_tokens) == 0:
3019+
if verbose:
3020+
print(
3021+
f"Llama.longest_token_prefix [Fast Exit 1]: Empty sequence detected. "
3022+
f"len(current_ids)={len(current_ids)}, len(new_tokens)={len(new_tokens)}",
3023+
file=sys.stderr
3024+
)
30173025
return 0
30183026

30193027
# Determine the comparison range (limited by the shorter sequence)
@@ -3022,6 +3030,12 @@ def longest_token_prefix(
30223030
# Probe inspection: Use Python to quickly compare the first token
30233031
# If the tokens are different from the beginning, return immediately to avoid any NumPy overhead.
30243032
if current_ids[0] != new_tokens[0]:
3033+
if verbose:
3034+
print(
3035+
f"Llama.longest_token_prefix [Fast Exit 2]: First token mismatch. "
3036+
f"current_ids[0]={current_ids[0]} vs new_tokens[0]={new_tokens[0]}",
3037+
file=sys.stderr
3038+
)
30253039
return 0
30263040

30273041
# Accelerating SIMD for Large Data Volumes

llama_cpp/llama_cache.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,13 @@ class LlamaDiskCache(BaseLlamaCache):
5959
"""
6060

6161
def __init__(
62-
self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30)
62+
self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30), verbose: bool = False
6363
):
6464
super().__init__(capacity_bytes)
6565
self.cache_dir = cache_dir
6666
# Native SQLite size limit and LRU eviction
6767
self.cache = diskcache.Cache(cache_dir, size_limit=capacity_bytes)
68+
self.verbose = verbose
6869

6970
@property
7071
def cache_size(self):
@@ -83,7 +84,7 @@ def _find_longest_prefix_key(
8384
min_key: Optional[Tuple[int, ...]] = None
8485
target_len = len(key)
8586
for k in self.cache.iterkeys(): # type: ignore
86-
prefix_len = llama_core.Llama.longest_token_prefix(k, key)
87+
prefix_len = llama_core.Llama.longest_token_prefix(k, key, self.verbose)
8788
if prefix_len > min_len:
8889
min_len = prefix_len
8990
min_key = k # type: ignore
@@ -123,13 +124,14 @@ class LlamaRAMCache(BaseLlamaCache):
123124
Maintains an LRU eviction policy with O(1) size tracking.
124125
"""
125126

126-
def __init__(self, capacity_bytes: int = (2 << 30)):
127+
def __init__(self, capacity_bytes: int = (2 << 30), verbose: bool = False):
127128
super().__init__(capacity_bytes)
128129
self.capacity_bytes = capacity_bytes
129130
self.cache_state: OrderedDict[
130131
Tuple[int, ...], "llama_core.LlamaState"
131132
] = OrderedDict()
132133
self._current_size = 0
134+
self.verbose = verbose
133135

134136
@property
135137
def cache_size(self):
@@ -142,7 +144,7 @@ def _find_longest_prefix_key(
142144
min_len = 0
143145
min_key = None
144146
keys = (
145-
(k, llama_core.Llama.longest_token_prefix(k, key))
147+
(k, llama_core.Llama.longest_token_prefix(k, key, self.verbose))
146148
for k in self.cache_state.keys()
147149
)
148150
for k, prefix_len in keys:

llama_cpp/llama_chat_format.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3206,7 +3206,7 @@ def _create_bitmap_func(idx: int, item: str):
32063206
media_id = -314159
32073207

32083208
if self.verbose:
3209-
print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens:{chunk_n_tokens}, media_id: {media_id}, ")
3209+
print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens: {chunk_n_tokens}, media_id: {media_id}, ")
32103210

32113211
chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id))
32123212

@@ -3303,7 +3303,7 @@ def __call__(
33033303
# 3. KV Cache Synchronization & State Rollback
33043304
# Compares the virtual ledger with physical history to prevent Cache Poisoning.
33053305
current_history = llama.input_ids[:llama.n_tokens].tolist()
3306-
longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids)
3306+
longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids, self.verbose)
33073307

33083308
if longest_prefix < llama.n_tokens:
33093309
if llama.is_hybrid and llama._hybrid_cache_mgr is not None:

0 commit comments

Comments (0)