CUDA: add bf16 and f32 support to cublas_mul_mat_batched (#14361)

* CUDA: add bf16 and f32 support to cublas_mul_mat_batched

* Review: add type traits and make function more generic

* Review: make check more explicit, add back comments, and fix formatting

* Review: fix formatting, remove useless type conversion, fix naming for bools
This commit is contained in:
Aman Gupta
2025-06-29 01:30:53 +08:00
committed by GitHub
parent 63a7bb3c7e
commit 27208bf657
4 changed files with 162 additions and 78 deletions

View File

@ -4425,8 +4425,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
for (auto nr : {1,4}) {
for (uint32_t m = 0; m < 2; ++m) {
for (uint32_t k = 0; k < 2; ++k) {
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, 1}, {nr, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, 1}, {nr, 1}, {0, 1, 2, 3}, true));
for (ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, 1}, {nr, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, 1}, {nr, 1}, {0, 1, 2, 3}, true));
}
}
}
}