feat: Allow custom layer filters for hybrid recurrent
This should help support architectures like Falcon H1 where there is overlap between layers that need attention and recurrent caches.

https://github.com/ggml-org/llama.cpp/pull/13979#discussion_r2140748922

Branch: HybridRecurrentCache

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
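As a rough illustration of what the new filters enable, here is a minimal, self-contained C++ sketch. Only the layer_filter_cb alias mirrors the header change in the diff below; the fake_hparams struct and the "every layer gets both caches" policy are hypothetical stand-ins for a Falcon-H1-style model, not code from this commit or from llama.cpp.

// Hypothetical, standalone sketch -- fake_hparams and the overlap policy below
// are illustrative only; layer_filter_cb matches the alias added in this commit.
#include <cstdint>
#include <cstdio>
#include <functional>

using layer_filter_cb = std::function<bool(int32_t il)>;

// stand-in for llama_hparams::recurrent_layer(il): pretend even layers are recurrent
struct fake_hparams {
    bool recurrent_layer(int32_t il) const { return il % 2 == 0; }
};

int main() {
    fake_hparams hparams;

    // default behavior (what the constructor falls back to when the caller
    // passes nullptr): attention and recurrent layers are disjoint
    layer_filter_cb default_attn      = [&](int32_t il) { return !hparams.recurrent_layer(il); };
    layer_filter_cb default_recurrent = [&](int32_t il) { return  hparams.recurrent_layer(il); };

    // custom behavior for a Falcon-H1-style model: every layer needs both an
    // attention cache and a recurrent cache, so the two filters overlap
    layer_filter_cb custom_attn      = [](int32_t /*il*/) { return true; };
    layer_filter_cb custom_recurrent = [](int32_t /*il*/) { return true; };

    // same selection logic the constructor uses: prefer the caller's filter,
    // otherwise fall back to the default disjoint split
    layer_filter_cb attn_filter      = custom_attn      == nullptr ? default_attn      : custom_attn;
    layer_filter_cb recurrent_filter = custom_recurrent == nullptr ? default_recurrent : custom_recurrent;

    for (int32_t il = 0; il < 4; ++il) {
        std::printf("layer %d: attn=%d recurrent=%d\n", (int) il, (int) attn_filter(il), (int) recurrent_filter(il));
    }
    return 0;
}

With the commit applied, a hybrid model could pass such callbacks as the trailing attn_filter/recurrent_filter constructor arguments; leaving them as nullptr keeps the previous mutually exclusive split on hparams.recurrent_layer(il).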
@@ -24,11 +24,16 @@ llama_kv_cache_hybrid_recurrent::llama_kv_cache_hybrid_recurrent(
     uint32_t recurrent_kv_size,
     /* common */
     uint32_t n_seq_max,
-    bool offload) :
+    bool offload,
+    /* layer filters */
+    layer_filter_cb && attn_filter,
+    layer_filter_cb && recurrent_filter) :
     hparams(model.hparams),
     kv_attn(new llama_kv_cache_unified(
         model,
-        [&](int32_t il) { return !model.hparams.recurrent_layer(il); },
+        attn_filter == nullptr ?
+            [&](int32_t il) { return !model.hparams.recurrent_layer(il); }
+            : attn_filter,
         attn_type_k,
         attn_type_v,
         attn_v_trans,
@@ -41,7 +46,9 @@ llama_kv_cache_hybrid_recurrent::llama_kv_cache_hybrid_recurrent(
     )),
     kv_recurrent(new llama_kv_cache_recurrent(
         model,
-        [&](int32_t il) { return model.hparams.recurrent_layer(il); },
+        recurrent_filter == nullptr ?
+            [&](int32_t il) { return model.hparams.recurrent_layer(il); }
+            : recurrent_filter,
         recurrent_type_k,
         recurrent_type_v,
         offload,

@@ -19,6 +19,10 @@

 class llama_kv_cache_hybrid_recurrent : public llama_memory_i {
 public:
+
+    // this callback is used to filter out layers that should not be included in the cache
+    using layer_filter_cb = std::function<bool(int32_t il)>;
+
     llama_kv_cache_hybrid_recurrent(
         const llama_model & model,
         /* attn */
@@ -35,7 +39,10 @@ public:
     uint32_t recurrent_kv_size,
     /* common */
     uint32_t n_seq_max,
-    bool offload);
+    bool offload,
+    /* layer filters */
+    layer_filter_cb && attn_filter = nullptr,
+    layer_filter_cb && recurrent_filter = nullptr);

     ~llama_kv_cache_hybrid_recurrent() = default;
