quantize: Handle user-defined quantization levels for additional tensors (#12511)
* Add llama_model_quantize_params parameters
* Add new quantize parameters parsing and validation
* Update usage
* Add new parameters defaults
* Add new quantization parameters logic
* Minor refactoring as per the contributors' coding guidelines
* Update descriptions to match existing style
* Implement general --tensor-type instead of a tensor-specific command option (see the sketch after this list)
* Fix implied type bug
* Restore missing #includes
* Add regex capability for tensor selection
* Refactor function name and update ALLOWED_TENSOR_TYPE
* Add missing #include
* Handle edge case when tensor name is cls.output
* Minor logging improvement
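The key user-facing change is the general --tensor-type option, which maps tensor-name patterns (with regex support) to specific quantization types. The snippet below is only an illustrative sketch of that idea, not code from this commit: the pattern=type argument form, the tensor_type_override struct, and the sample tensor names are all assumptions.

// Hypothetical illustration of regex-based tensor selection (not the PR's code).
#include <iostream>
#include <regex>
#include <string>
#include <vector>

struct tensor_type_override {      // assumed shape of one --tensor-type entry
    std::regex  pattern;           // which tensor names it applies to
    std::string type;              // target quantization type, e.g. "q6_k"
};

// Parse "pattern=type" (the assumed --tensor-type argument form).
static tensor_type_override parse_override(const std::string & arg) {
    const auto pos = arg.find('=');
    return { std::regex(arg.substr(0, pos), std::regex::icase), arg.substr(pos + 1) };
}

int main() {
    std::vector<tensor_type_override> overrides = {
        parse_override("attn_v=q6_k"),                  // all attention V tensors
        parse_override("blk\\.[0-3]\\.ffn_down=q8_0"),  // FFN down in layers 0-3
    };

    // Example tensor names; in the real tool these come from the model file.
    for (const std::string name : { "blk.2.attn_v.weight", "blk.10.ffn_down.weight" }) {
        for (const auto & ov : overrides) {
            if (std::regex_search(name, ov.pattern)) {
                std::cout << name << " -> " << ov.type << "\n";
            }
        }
    }
}

Per the commit message, the real option also validates selections against an allowed tensor/type set (ALLOWED_TENSOR_TYPE), which this sketch omits.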
@@ -367,17 +367,18 @@ extern "C" {
    // model quantization parameters
    typedef struct llama_model_quantize_params {
        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
        enum llama_ftype ftype;              // quantize to this llama_ftype
        enum ggml_type output_tensor_type;   // output tensor type
        enum ggml_type token_embedding_type; // token embeddings tensor type
        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
        bool quantize_output_tensor;         // quantize output.weight
        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
        bool pure;                           // quantize all tensors to the default type
        bool keep_split;                     // quantize to the same number of shards
        void * imatrix;                      // pointer to importance matrix data
        void * kv_overrides;                 // pointer to vector containing overrides
+       void * tensor_types;                 // pointer to vector containing tensor types
    } llama_model_quantize_params;

    typedef struct llama_logit_bias {
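For completeness, a minimal caller-side sketch of the extended struct. llama_model_quantize_default_params(), llama_model_quantize(), and LLAMA_FTYPE_MOSTLY_Q4_K_M are existing llama.h API; the file names are placeholders, and since the concrete type behind the new tensor_types pointer is defined inside the quantize tool rather than in this header, it is left unset here.

// Minimal sketch (assumptions noted in comments): quantize a model using the
// extended llama_model_quantize_params.
#include "llama.h"

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();

    params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // base quantization type
    params.nthread = 8;                         // quantization worker threads

    // New in this commit: per-tensor overrides (built from --tensor-type
    // arguments by the quantize tool) travel through this opaque pointer.
    // Its pointee type is defined inside the tool, so it stays null here.
    params.tensor_types = nullptr;

    // Placeholder file names; returns 0 on success.
    return (int) llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
}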