mtmd : fix the calculation of n_tokens for smolvlm (#13381)

Co-authored-by: Taichi Nishimura <Taichi.A.Nishimura@sony.com>
2025-06-27 12:05:03 +00:00 · 2025-05-08 22:03:53 +09:00
parent 6562e5a4d6
commit 0ccc121354
1 changed file with 1 addition and 1 deletion
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3010,7 +3010,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        int n_per_side_2d_pool = n_per_side / params.proj_scale_factor;
        n_patches = n_per_side_2d_pool * n_per_side_2d_pool;
    } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
-        n_patches /= params.proj_scale_factor;
+        n_patches /= (params.proj_scale_factor * params.proj_scale_factor);
    } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
        int n_merge = params.spatial_merge_size;
        int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);