// llama-cpp.h: C++ RAII helpers (std::unique_ptr deleters and aliases) for the llama.h C API
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#ifndef __cplusplus
|
|
|
|
#error "This header is for C++ only"
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include <memory>
|
|
|
|
|
|
|
|
#include "llama.h"
|
|
|
|
|
|
|
|
struct llama_model_deleter {
|
2025-01-06 10:55:18 +02:00
|
|
|
void operator()(llama_model * model) { llama_model_free(model); }
|
2024-11-25 16:56:24 -05:00
|
|
|
};
|
|
|
|
|
|
|
|
struct llama_context_deleter {
|
|
|
|
void operator()(llama_context * context) { llama_free(context); }
|
|
|
|
};
|
|
|
|
|
|
|
|
struct llama_sampler_deleter {
|
|
|
|
void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
|
|
|
|
};
|
|
|
|
|
2025-01-12 11:32:42 +02:00
|
|
|
struct llama_adapter_lora_deleter {
|
|
|
|
void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
|
2025-01-03 10:18:53 +02:00
|
|
|
};
|
|
|
|
|
2025-02-16 00:02:53 +01:00
|
|
|
struct llama_batch_ext_deleter {
|
|
|
|
void operator()(llama_batch_ext * batch) { llama_batch_ext_free(batch); }
|
2025-02-14 18:16:49 +01:00
|
|
|
};
|
|
|
|
|
2024-11-25 16:56:24 -05:00
|
|
|
typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
|
|
|
|
typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
|
|
|
|
typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
|
2025-01-12 11:32:42 +02:00
|
|
|
typedef std::unique_ptr<llama_adapter_lora, llama_adapter_lora_deleter> llama_adapter_lora_ptr;
|
2025-03-14 17:12:03 +01:00
|
|
|
|
|
|
|
struct llama_batch_ext_ptr : std::unique_ptr<llama_batch_ext, llama_batch_ext_deleter> {
|
2025-03-14 22:30:29 +01:00
|
|
|
llama_batch_ext_ptr() : std::unique_ptr<llama_batch_ext, llama_batch_ext_deleter>() {}
|
2025-03-14 17:12:03 +01:00
|
|
|
llama_batch_ext_ptr(llama_batch_ext * batch) : std::unique_ptr<llama_batch_ext, llama_batch_ext_deleter>(batch) {}
|
|
|
|
|
2025-03-14 22:17:07 +01:00
|
|
|
// convenience function to create a batch from text tokens, without worrying about manually freeing it
|
|
|
|
static llama_batch_ext_ptr init_from_text(llama_token * tokens,
|
2025-03-14 17:12:03 +01:00
|
|
|
int32_t n_tokens,
|
|
|
|
int32_t pos0,
|
|
|
|
int32_t seq_id,
|
|
|
|
bool output_last) {
|
|
|
|
return llama_batch_ext_ptr(llama_batch_ext_init_from_text(tokens, n_tokens, pos0, seq_id, output_last));
|
|
|
|
}
|
|
|
|
|
2025-03-14 22:17:07 +01:00
|
|
|
// convenience function to create a batch from text embeddings, without worrying about manually freeing it
|
|
|
|
static llama_batch_ext_ptr init_from_embd(float * embd,
|
2025-03-14 17:12:03 +01:00
|
|
|
size_t n_tokens,
|
|
|
|
size_t n_embd,
|
|
|
|
int32_t pos0,
|
|
|
|
int32_t seq_id) {
|
|
|
|
return llama_batch_ext_ptr(llama_batch_ext_init_from_embd(embd, n_tokens, n_embd, pos0, seq_id));
|
|
|
|
}
|
|
|
|
};
|