llama : allow other bufts when overriding to CPU, add --no-repack option (#14990)

This commit is contained in:
Diego Devesa
2025-07-31 09:11:34 -07:00
committed by GitHub
parent e08a98826b
commit d6818d06a6
5 changed files with 39 additions and 21 deletions

View File

@@ -284,10 +284,11 @@ extern "C" {
const struct llama_model_kv_override * kv_overrides;
// Keep the booleans together to avoid misalignment during copy-by-value.
-    bool vocab_only;    // only load the vocabulary, no weights
-    bool use_mmap;      // use mmap if possible
-    bool use_mlock;     // force system to keep model in RAM
-    bool check_tensors; // validate model tensor data
+    bool vocab_only;      // only load the vocabulary, no weights
+    bool use_mmap;        // use mmap if possible
+    bool use_mlock;       // force system to keep model in RAM
+    bool check_tensors;   // validate model tensor data
+    bool use_extra_bufts; // use extra buffer types (used for weight repacking)
};
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations