2025-05-13 17:07:21 +02:00
|
|
|
#pragma once
|
2023-10-12 18:23:18 +03:00
|
|
|
|
2025-04-05 17:17:40 +02:00
|
|
|
#include "ggml.h"
|
2023-11-06 22:36:23 +01:00
|
|
|
#include <stddef.h>
|
|
|
|
#include <stdint.h>
|
|
|
|
|
2025-05-25 14:06:32 +02:00
|
|
|
// !!! Internal header, to be used by mtmd only !!!
|
|
|
|
|
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
|
|
|
struct clip_ctx;
|
2023-10-12 18:23:18 +03:00
|
|
|
|
2024-08-09 18:33:53 +08:00
|
|
|
struct clip_image_size {
|
|
|
|
int width;
|
|
|
|
int height;
|
|
|
|
};
|
|
|
|
|
2025-04-20 03:15:41 -07:00
|
|
|
struct clip_image_f32;
|
2025-04-11 12:09:39 +02:00
|
|
|
struct clip_image_u8_batch;
|
|
|
|
struct clip_image_f32_batch;
|
2023-10-12 18:23:18 +03:00
|
|
|
|
2025-05-27 14:06:10 +02:00
|
|
|
enum clip_modality {
|
|
|
|
CLIP_MODALITY_VISION,
|
|
|
|
CLIP_MODALITY_AUDIO,
|
|
|
|
};
|
|
|
|
|
2025-03-11 09:20:16 +01:00
|
|
|
struct clip_context_params {
|
|
|
|
bool use_gpu;
|
2025-04-20 03:15:41 -07:00
|
|
|
enum ggml_log_level verbosity;
|
2025-03-11 09:20:16 +01:00
|
|
|
};
|
|
|
|
|
2025-05-27 14:06:10 +02:00
|
|
|
struct clip_init_result {
|
|
|
|
struct clip_ctx * ctx_v; // vision context
|
|
|
|
struct clip_ctx * ctx_a; // audio context
|
|
|
|
};
|
|
|
|
|
|
|
|
struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params);
|
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
|
|
|
|
2025-05-13 17:07:21 +02:00
|
|
|
void clip_free(struct clip_ctx * ctx);
|
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
|
|
|
|
2025-05-13 17:07:21 +02:00
|
|
|
size_t clip_embd_nbytes(const struct clip_ctx * ctx);
|
|
|
|
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
|
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
|
|
|
|
2025-05-13 17:07:21 +02:00
|
|
|
int32_t clip_get_image_size (const struct clip_ctx * ctx);
|
|
|
|
int32_t clip_get_patch_size (const struct clip_ctx * ctx);
|
|
|
|
int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
|
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
|
|
|
|
|
|
|
// TODO: should be enum, not string
|
2025-05-13 17:07:21 +02:00
|
|
|
const char * clip_patch_merge_type(const struct clip_ctx * ctx);
|
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
|
|
|
|
2025-05-13 17:07:21 +02:00
|
|
|
const int32_t * clip_image_grid(const struct clip_ctx * ctx);
|
|
|
|
size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
|
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
|
|
|
|
2025-05-13 17:07:21 +02:00
|
|
|
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
2025-04-29 11:47:04 +02:00
|
|
|
|
|
|
|
// for M-RoPE, this will be the number of token positions in X and Y directions
|
|
|
|
// for other models, X will be the total number of tokens and Y will be 1
|
2025-05-13 17:07:21 +02:00
|
|
|
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
|
|
|
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
2025-04-29 11:47:04 +02:00
|
|
|
|
|
|
|
// this should be equal to the embedding dimension of the text model
|
2025-05-13 17:07:21 +02:00
|
|
|
int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py
will now search for projector
* Update convert-image-encoder-to-gguf.py
whoops
* Update llava-surgery-v2.py
* Clip: Bugfix for normalization (it did not loat the 3 std and mean values)
Clip: bicubic resize function
Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6)
Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported
llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final
convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted.
Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference.
* ws
* added verbose_prompt support into cli
added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (need for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non llava-1.6
It should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit
---------
Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-14 08:38:35 +01:00
|
|
|
|
2025-05-13 17:07:21 +02:00
|
|
|
struct clip_image_size * clip_image_size_init(void);
|
|
|
|
struct clip_image_u8 * clip_image_u8_init (void);
|
|
|
|
struct clip_image_f32 * clip_image_f32_init(void);
|
|
|
|
struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
|
2023-12-30 23:24:42 +02:00
|
|
|
|
2025-04-10 22:57:16 +02:00
|
|
|
// nx, ny are the output image dimensions
|
2025-05-13 17:07:21 +02:00
|
|
|
unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
|
2025-04-10 22:57:16 +02:00
|
|
|
|
2025-05-13 17:07:21 +02:00
|
|
|
void clip_image_size_free (struct clip_image_size * img_size);
|
|
|
|
void clip_image_u8_free (struct clip_image_u8 * img);
|
|
|
|
void clip_image_f32_free(struct clip_image_f32 * img);
|
|
|
|
void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
|
|
|
|
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
|
2023-12-30 23:24:42 +02:00
|
|
|
|
2025-04-11 12:09:39 +02:00
|
|
|
// use for accessing underlay data of clip_image_f32_batch
|
2025-05-13 17:07:21 +02:00
|
|
|
size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
|
|
|
|
size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
|
|
|
|
size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
|
|
|
|
struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
|
2025-04-11 12:09:39 +02:00
|
|
|
|
2025-02-26 22:26:52 +08:00
|
|
|
/**
|
|
|
|
* Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
|
|
|
|
* The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
|
|
|
|
*/
|
2025-05-13 17:07:21 +02:00
|
|
|
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
|
2025-02-22 22:28:28 +08:00
|
|
|
|
2025-05-13 17:07:21 +02:00
|
|
|
bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
2023-12-30 23:24:42 +02:00
|
|
|
|
2023-11-06 22:36:23 +01:00
|
|
|
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
|
2025-05-13 17:07:21 +02:00
|
|
|
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
|
2023-11-06 22:36:23 +01:00
|
|
|
|
2024-05-28 11:48:16 +09:00
|
|
|
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
2025-05-13 17:07:21 +02:00
|
|
|
bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
llama : add Qwen2VL support + multimodal RoPE (#10361)
* Barebone Qwen2VL LLM convertor
* Add Qwen2VL cli entrypoint
* [WIP] add qwen2vl arch
* Verify m-rope output
* Add vl-rope/2d-rope support for qwen2vl ViT
* update qwen2vl cli tool
* update 5D tensor op workaround
* [WIP] qwen2vl vision model
* make batch and clip utils compatible with qwen2vl
* [WIP] create inference workflow, gguf convert script but fix
* correcting vision-rope behavior, add the missing last layer back to ViT
* add arg parser to qwen2vl_surgery
* replace variable size array with vector
* cuda-gdb cmake preset
* add fp32 mrope, vision rope kernel
* add fp16 support for qwen2vl and m-rope
* add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION`
* fix rope op mode switching, out dated func args
* update `llama_hparams`
* update to keep up stream changes
* resolve linter, test errors
* add makefile entry, update speical image padding token
* add mrope unit test, fix few compiler warnings
* rename `mrope` related function, params
* minor updates on debug util, bug fixs
* add `m-rope` testcase to `test-backend-ops`
* Apply suggestions from code review
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* fix traililng whitespce
* store `llama_hparams.rope_sections` with fixed size array
* update position id tensor size check in GGML_OP_ROPE
* minor updates
* update `ggml_backend_*_supports_op` of unsupported backends
* remote old `rope_section` compare operator
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-12-14 20:43:46 +08:00
|
|
|
|
2025-05-13 17:07:21 +02:00
|
|
|
struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
2024-08-09 18:33:53 +08:00
|
|
|
|
2025-05-13 17:07:21 +02:00
|
|
|
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
|
|
|
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
|
2025-02-02 15:48:46 +08:00
|
|
|
|
2025-05-13 17:07:21 +02:00
|
|
|
int clip_is_minicpmv(const struct clip_ctx * ctx);
|
|
|
|
bool clip_is_glm(const struct clip_ctx * ctx);
|
|
|
|
bool clip_is_qwen2vl(const struct clip_ctx * ctx);
|
|
|
|
bool clip_is_llava(const struct clip_ctx * ctx);
|
|
|
|
bool clip_is_gemma3(const struct clip_ctx * ctx);
|
2023-10-12 18:23:18 +03:00
|
|
|
|
2025-05-13 17:07:21 +02:00
|
|
|
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
|
2025-05-22 20:42:48 +02:00
|
|
|
|
|
|
|
// use by audio input
|
|
|
|
void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel);
|
|
|
|
|
|
|
|
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
|
|
|
|
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
|
2025-05-25 14:06:32 +02:00
|
|
|
bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
|