kv-cache : log (debug) all streams in find_slot (#15176)

This commit updates `llama_kv_cache_unified::find_slot` to log information for all streams when debug is enabled. The motivation for this change is that currently if a non-unified kv-cache is used, then only one stream will be logged because the code was currently uses `seq_to_stream[1]`.
2025-08-17 13:40:55 -04:00 · 2025-08-11 11:21:19 +02:00
parent 50e81bdf5d
commit cd3069dfcb
1 changed files with 56 additions and 52 deletions
--- a/src/llama-kv-cache-unified.cpp
+++ b/src/llama-kv-cache-unified.cpp
@@ -738,66 +738,70 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
 }

 llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const {
+
    if (debug > 0) {
-        const auto & cells = v_cells[seq_to_stream[1]];
+        for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+            const auto seq_id = ubatch.seq_id_unq[s];
+            const auto stream_id = seq_to_stream[seq_id];
+            const auto & cells = v_cells[stream_id];
+            const uint32_t head_cur = v_heads[stream_id];

-        const uint32_t head_cur = v_heads[1];
+            LLAMA_LOG_DEBUG("%s: stream[%d], n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
+                    __func__, stream_id, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);

-        LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
-                __func__, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
-
-        if ((debug == 2 && n_swa > 0) || debug > 2) {
-            std::string ss;
-            for (uint32_t i = 0; i < cells.size(); ++i) {
-                if (cells.is_empty(i)) {
-                    ss += '.';
-                } else {
-                    assert(cells.seq_count(i) >= 1);
-
-                    if (cells.seq_count(i) == 1) {
-                        ss += std::to_string(cells.seq_get(i));
+            if ((debug == 2 && n_swa > 0) || debug > 2) {
+                std::string ss;
+                for (uint32_t i = 0; i < cells.size(); ++i) {
+                    if (cells.is_empty(i)) {
+                        ss += '.';
                    } else {
-                        ss += 'M';
+                        assert(cells.seq_count(i) >= 1);
+
+                        if (cells.seq_count(i) == 1) {
+                            ss += std::to_string(cells.seq_get(i));
+                        } else {
+                            ss += 'M';
+                        }
+                    }
+                    if (i%256 == 255) {
+                        ss += " *";
+                        ss += '\n';
                    }
                }
-                if (i%256 == 255) {
-                    ss += " *";
-                    ss += '\n';
-                }
-            }
-            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
-        }
-
-        if ((debug == 2 && n_swa > 0) || debug > 2) {
-            std::string ss;
-            for (uint32_t i = 0; i < cells.size(); ++i) {
-                std::string cur;
-                if (cells.is_empty(i)) {
-                    cur = '.';
-                } else {
-                    cur = std::to_string(cells.pos_get(i));
-                }
-                const int n = cur.size();
-                for (int j = 0; j < 5 - n; ++j) {
-                    cur += ' ';
-                }
-                ss += cur;
-                if (i%256 == 255) {
-                    ss += " *";
-                }
-                if (i%64 == 63) {
-                    ss += '\n';
-                }
-            }
-            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
-        }
-
-        for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
-            if (cells.seq_pos_min(s) < 0) {
-                continue;
+                LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
            }

-            LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
+            if ((debug == 2 && n_swa > 0) || debug > 2) {
+                std::string ss;
+                for (uint32_t i = 0; i < cells.size(); ++i) {
+                    std::string cur;
+                    if (cells.is_empty(i)) {
+                        cur = '.';
+                    } else {
+                        cur = std::to_string(cells.pos_get(i));
+                    }
+                    const int n = cur.size();
+                    for (int j = 0; j < 5 - n; ++j) {
+                        cur += ' ';
+                    }
+                    ss += cur;
+                    if (i%256 == 255) {
+                        ss += " *";
+                    }
+                    if (i%64 == 63) {
+                        ss += '\n';
+                    }
+                }
+                LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
+            }
+
+            for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+                if (cells.seq_pos_min(s) < 0) {
+                    continue;
+                }
+
+                LLAMA_LOG_DEBUG("%s: stream[%d] min[%d] = %5d, max[%d] = %5d\n", __func__, stream_id, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
+            }
        }
    }