ggml : fix fallback to CPU for unsupported ops (#15118)

This commit is contained in:
Diego Devesa
2025-08-06 05:37:35 -07:00
committed by GitHub
parent 65c797c4fa
commit 0d8831543c
4 changed files with 27 additions and 25 deletions

View File

@@ -1071,6 +1071,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             }
         }
+        // if the node is still unassigned, assign it to the first backend that supports it
+        for (int b = 0; b < sched->n_backends && *cur_backend_id == -1; b++) {
+            ggml_backend_sched_set_if_supported(sched, node, b, cur_backend_id);
+        }
+        GGML_ASSERT(*cur_backend_id != -1);
     }

     // pass 5: split graph, find tensors that need to be copied
@@ -1098,7 +1103,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         const int node_backend_id = tensor_backend_id(node);
-        assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
+        GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback

         // check if we should start a new split based on the sources of the current node
         bool need_new_split = false;
@@ -1156,7 +1161,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 size_t src_id = hash_id(src);
                 const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
-                assert(src_backend_id != -1); // all inputs should be assigned by now
+                GGML_ASSERT(src_backend_id != -1); // all inputs should be assigned by now
                 if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                     if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {

View File

@@ -35,7 +35,7 @@
 // ggml-backend interface

-std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type() {
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types() {
     static std::vector<ggml_backend_buffer_type_t> bufts = []() {
         std::vector<ggml_backend_buffer_type_t> bufts;
@@ -57,8 +57,6 @@
         }
 #endif
-        bufts.push_back(NULL);
-
         return bufts;
     }();
@@ -66,14 +64,20 @@
 }

 static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
-    return ggml_backend_cpu_get_extra_buffers_type().data();
+    static std::vector<ggml_backend_buffer_type_t> extra_bufts = [] {
+        std::vector<ggml_backend_buffer_type_t> bufts = ggml_backend_cpu_get_extra_buffer_types();
+        bufts.push_back(nullptr);
+        return bufts;
+    }();
+
+    return extra_bufts.data();

     GGML_UNUSED(device);
 }

 static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
-    for (auto * extra : ggml_backend_cpu_get_extra_buffers_type()) {
-        if (extra && extra == buft) {
+    for (auto * extra : ggml_backend_cpu_get_extra_buffer_types()) {
+        if (extra == buft) {
             return true;
         }
     }
@@ -397,20 +401,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
         return true;
     }

-    // extra_buffer_op?
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
-        if (extra) {
-            auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
-            if (buf_extra && buf_extra->supports_op(dev, op)) {
-                return true;
-            }
-        }
-    }
-
-    // the other case need host buffer.
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (op->src[i] && op->src[i]->buffer && !ggml_backend_buft_is_host(op->src[i]->buffer->buft)) {
-            return false;
+    // check extra buffer types
+    // note: only the first sources are checked for extra buffer types to reduce overhead, increase if necessary
+    for (int i = 0; i < 4; i++) {
+        if (op->src[i] && op->src[i]->buffer &&
+            ggml_backend_cpu_is_extra_buffer_type(op->src[i]->buffer->buft)) {
+            auto * buf_extra = (ggml::cpu::extra_buffer_type *) op->src[i]->buffer->buft->context;
+            return buf_extra->supports_op(dev, op);
         }
     }

View File

@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
 }  // namespace ggml::cpu

 bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -23,7 +23,7 @@
 }

 bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);

View File

@@ -33,6 +33,6 @@ class extra_buffer_type {
 }  // namespace ggml::cpu

 // implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types();
 #endif