mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-07-29 05:33:37 -04:00
CUDA: fix partial offloading for ne0 % 256 != 0 (#8572)
This commit is contained in:
@@ -464,12 +464,12 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
|
||||
return;
|
||||
}
|
||||
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
// initialize padding to 0 to avoid possible NaN values
|
||||
size_t original_size = ggml_nbytes(tensor);
|
||||
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
||||
|
||||
if (padded_size > original_size && tensor->view_src == nullptr) {
|
||||
if (padded_size > original_size) {
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
|
||||
}
|
||||
@@ -1485,6 +1485,13 @@ static void ggml_cuda_op_mul_mat(
|
||||
dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
|
||||
}
|
||||
|
||||
// If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared:
|
||||
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
|
||||
const int64_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
|
||||
const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
|
||||
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
|
||||
}
|
||||
|
||||
if (src1_on_device && src1_is_contiguous) {
|
||||
dev[id].src1_ddf = (float *) src1->data;
|
||||
} else {
|
||||
|
Reference in New Issue
Block a user