Force FP32 compute in GLM4 FFN Down (#13101)
* Force FP32 compute in cuBLAS GEMM
* Revert "Force FP32 compute in cuBLAS GEMM"
This reverts commit 6efd872732.
* Force F32 compute in GLM4 ffn down
* Edit comment to clarify issue
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
@@ -803,6 +803,10 @@ ggml_tensor * llm_graph_context::build_ffn(
 
     if (down) {
         cur = build_lora_mm(down, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }
 
     if (down_b) {
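For readers unfamiliar with the ggml side of this change, below is a minimal standalone sketch of how ggml_mul_mat_set_prec() marks a single matmul node for FP32 accumulation. The context setup, toy dimensions, and main() wrapper are illustrative assumptions and not part of this commit; only ggml_mul_mat_set_prec() and GGML_PREC_F32 come from the actual patch.

// Sketch only: bare ggml C API with toy dimensions, not llama.cpp's build_ffn().
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,   // build graph metadata only, no tensor data buffers
    };
    struct ggml_context * ctx = ggml_init(params);

    // toy FFN "down" projection: F16 weight, F32 activations
    struct ggml_tensor * down = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 1024, 256);
    struct ggml_tensor * cur  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1024, 1);

    cur = ggml_mul_mat(ctx, down, cur);

    // Request full-precision (FP32) accumulation for this matmul node instead of
    // the default, which may use half-precision accumulators on some backends.
    ggml_mul_mat_set_prec(cur, GGML_PREC_F32);

    ggml_free(ctx);
    return 0;
}

In the patch itself, the same call is applied to the result of build_lora_mm(down, cur), and only when arch == LLM_ARCH_GLM4, so other architectures keep the default matmul precision.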