model : do not repack if a GPU device is present (#12498)

ggml-ci
2025-06-29 04:35:05 +00:00 · 2025-03-21 16:14:29 +02:00
parent 960e726077
commit af04481e6b
1 changed files with 23 additions and 10 deletions
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -271,7 +271,17 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
        }
    }

-    // add extra buffer types
+    bool has_gpu_device = false;
+    for (auto * dev : devices) {
+        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+            has_gpu_device = true;
+            break;
+        }
+    }
+
+    // add extra buffer types, only if no GPU device is present
+    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
+    if (!has_gpu_device) {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
@@ -283,6 +293,9 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
                ++extra_bufts;
            }
        }
+    } else {
+        LLAMA_LOG_WARN("%s: disabling extra buffer types (i.e. repacking) since a GPU device is available\n", __func__);
+    }

    // add a host buffer type
    // storing the tensors in a host buffer is useful when the processing of large batches