Skip to content

Commit 90d46dc

Browse files
committed
feat(core): add verbose debug logging to longest_token_prefix fast paths
- Added an optional `verbose` parameter to `Llama.longest_token_prefix` to explicitly log early-exit conditions. This provides crucial visibility into cache-miss behaviors during debugging by outputting the specific reason for a fast exit (e.g., empty sequence vs. mismatched first token) along with the offending sequence lengths or token values.
1 parent 35957bd commit 90d46dc

3 files changed

Lines changed: 27 additions & 11 deletions

File tree

llama_cpp/llama.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1210,7 +1210,7 @@ def generate(
12101210
# Check for kv cache prefix match
12111211
if reset and self.n_tokens > 0:
12121212
# 1. First, check for a 100% exact match of the entire sequence
1213-
full_match_prefix = self.longest_token_prefix(self._input_ids, tokens)
1213+
full_match_prefix = self.longest_token_prefix(self._input_ids, tokens, self.verbose)
12141214

12151215
# --- FAST PATH: Zero-latency bypass for Hybrid Single-Turn & Multimodal ---
12161216
# If the cache is disabled (max_checkpoints <= 0) and we have a 100% match,
@@ -1233,7 +1233,7 @@ def generate(
12331233
else:
12341234
# By matching against `tokens[:-1]`, we intentionally drop the last token.
12351235
# This forces the engine to re-evaluate the final token to refresh sampling logits.
1236-
longest_prefix = self.longest_token_prefix(self._input_ids, tokens[:-1])
1236+
longest_prefix = self.longest_token_prefix(self._input_ids, tokens[:-1], self.verbose)
12371237

12381238
if longest_prefix > 0:
12391239
reset = False
@@ -1840,10 +1840,10 @@ def _create_completion(
18401840
try:
18411841
cache_item = self.cache[prompt_tokens]
18421842
cache_prefix_len = Llama.longest_token_prefix(
1843-
cache_item.input_ids, prompt_tokens
1843+
cache_item.input_ids, prompt_tokens, self.verbose
18441844
)
18451845
eval_prefix_len = Llama.longest_token_prefix(
1846-
self._input_ids, prompt_tokens
1846+
self._input_ids, prompt_tokens, self.verbose
18471847
)
18481848
if cache_prefix_len > eval_prefix_len:
18491849
self.load_state(cache_item)
@@ -2996,7 +2996,8 @@ def logits_to_logprobs(
29962996
@staticmethod
29972997
def longest_token_prefix(
29982998
current_ids: Union[Sequence[int], npt.NDArray[np.intc]],
2999-
new_tokens: Union[Sequence[int], npt.NDArray[np.intc]]
2999+
new_tokens: Union[Sequence[int], npt.NDArray[np.intc]],
3000+
verbose: bool = False
30003001
) -> int:
30013002
"""
30023003
Calculates the length of the longest common prefix between two token sequences.
@@ -3008,12 +3009,19 @@ def longest_token_prefix(
30083009
Args:
30093010
current_ids: The existing token sequence (e.g., KV cache).
30103011
new_tokens: The new input token sequence.
3012+
verbose: If True, prints detailed debug information to stderr.
30113013
30123014
Returns:
30133015
int: The number of matching tokens from the start.
30143016
"""
30153017
# Fast exit for empty sequences to avoid unnecessary processing
30163018
if len(current_ids) == 0 or len(new_tokens) == 0:
3019+
if verbose:
3020+
print(
3021+
f"Llama.longest_token_prefix [Fast Exit 1]: Empty sequence detected. "
3022+
f"len(current_ids)={len(current_ids)}, len(new_tokens)={len(new_tokens)}",
3023+
file=sys.stderr
3024+
)
30173025
return 0
30183026

30193027
# Determine the comparison range (limited by the shorter sequence)
@@ -3022,6 +3030,12 @@ def longest_token_prefix(
30223030
# Probe inspection: Use Python to quickly compare the first token
30233031
# If the tokens are different from the beginning, return immediately to avoid any NumPy overhead.
30243032
if current_ids[0] != new_tokens[0]:
3033+
if verbose:
3034+
print(
3035+
f"Llama.longest_token_prefix [Fast Exit 2]: First token mismatch. "
3036+
f"current_ids[0]={current_ids[0]} vs new_tokens[0]={new_tokens[0]}",
3037+
file=sys.stderr
3038+
)
30253039
return 0
30263040

30273041
# Accelerating SIMD for Large Data Volumes

llama_cpp/llama_cache.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,13 @@ class LlamaDiskCache(BaseLlamaCache):
5959
"""
6060

6161
def __init__(
62-
self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30)
62+
self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30), verbose: bool = False
6363
):
6464
super().__init__(capacity_bytes)
6565
self.cache_dir = cache_dir
6666
# Native SQLite size limit and LRU eviction
6767
self.cache = diskcache.Cache(cache_dir, size_limit=capacity_bytes)
68+
self.verbose = verbose
6869

6970
@property
7071
def cache_size(self):
@@ -83,7 +84,7 @@ def _find_longest_prefix_key(
8384
min_key: Optional[Tuple[int, ...]] = None
8485
target_len = len(key)
8586
for k in self.cache.iterkeys(): # type: ignore
86-
prefix_len = llama_core.Llama.longest_token_prefix(k, key)
87+
prefix_len = llama_core.Llama.longest_token_prefix(k, key, self.verbose)
8788
if prefix_len > min_len:
8889
min_len = prefix_len
8990
min_key = k # type: ignore
@@ -123,13 +124,14 @@ class LlamaRAMCache(BaseLlamaCache):
123124
Maintains an LRU eviction policy with O(1) size tracking.
124125
"""
125126

126-
def __init__(self, capacity_bytes: int = (2 << 30)):
127+
def __init__(self, capacity_bytes: int = (2 << 30), verbose: bool = False):
127128
super().__init__(capacity_bytes)
128129
self.capacity_bytes = capacity_bytes
129130
self.cache_state: OrderedDict[
130131
Tuple[int, ...], "llama_core.LlamaState"
131132
] = OrderedDict()
132133
self._current_size = 0
134+
self.verbose = verbose
133135

134136
@property
135137
def cache_size(self):
@@ -142,7 +144,7 @@ def _find_longest_prefix_key(
142144
min_len = 0
143145
min_key = None
144146
keys = (
145-
(k, llama_core.Llama.longest_token_prefix(k, key))
147+
(k, llama_core.Llama.longest_token_prefix(k, key, self.verbose))
146148
for k in self.cache_state.keys()
147149
)
148150
for k, prefix_len in keys:

llama_cpp/llama_chat_format.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3206,7 +3206,7 @@ def _create_bitmap_func(idx: int, item: str):
32063206
media_id = -314159
32073207

32083208
if self.verbose:
3209-
print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens:{chunk_n_tokens}, media_id: {media_id}, ")
3209+
print(f"{self.log_prefix}(mtmd_input_chunk_media_id): chunk_n_tokens: {chunk_n_tokens}, media_id: {media_id}, ")
32103210

32113211
chunk_token_spans.append((current_idx, current_idx + chunk_n_tokens, chunk, chunk_type, media_id))
32123212

@@ -3303,7 +3303,7 @@ def __call__(
33033303
# 3. KV Cache Synchronization & State Rollback
33043304
# Compares the virtual ledger with physical history to prevent Cache Poisoning.
33053305
current_history = llama.input_ids[:llama.n_tokens].tolist()
3306-
longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids)
3306+
longest_prefix = llama.longest_token_prefix(current_history, full_prompt_ids, self.verbose)
33073307

33083308
if longest_prefix < llama.n_tokens:
33093309
if llama.is_hybrid and llama._hybrid_cache_mgr is not None:

0 commit comments

Comments (0)