llama : add option to override model tensor buffers (#11397)

* llama : add option to override tensor buffers

* ggml : fix possible underflow in ggml_nbytes
This commit is contained in:
Diego Devesa
2025-04-02 14:52:01 +02:00
committed by GitHub
parent a10b36c91a
commit e0e912f49b
12 changed files with 108 additions and 9 deletions

View File

@ -255,7 +255,8 @@ llama_context::llama_context(
model.n_devices() > 1 &&
model.params.n_gpu_layers > (int) model.hparams.n_layer &&
model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
cparams.offload_kqv;
cparams.offload_kqv &&
!model.has_tensor_overrides();
// pipeline parallelism requires support for async compute and events in all devices
if (pipeline_parallel) {