CUDA: stream-k decomposition for MMQ (#8018)

* CUDA: stream-k decomposition for MMQ

* fix undefined memory reads for small matrices
This commit is contained in:
Johannes Gäßler
2024-06-20 14:39:21 +02:00
committed by GitHub
parent 2075a66a96
commit d50f8897a7
4 changed files with 292 additions and 113 deletions

View File

@@ -635,7 +635,7 @@ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> &
}
const int cc = ggml_cuda_info().devices[id].cc;
-row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
+row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
}
return row_rounding;
}