#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "gguf.h"

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <thread>
#include <mutex>
#include <vector>
#include <fstream>
#include <unordered_map>
#include <map>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

static void print_usage(int, char ** argv) {
    LOG("\nexample usage:\n");
    LOG("\n    %s \\\n"
        "       -m model.gguf -f some-text.txt [-o imatrix.gguf] [--process-output] \\\n"
        "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
        "       [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] \\\n"
        "       [--parse-special]\n", argv[0]);
    LOG("\n");
}

static bool str_has_suffix(const std::string & str, const std::string & suffix) {
    return str.size() >= suffix.size() && str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}

static bool str_remove_suffix(std::string & str, const std::string & suffix) {
    bool has_suffix = str_has_suffix(str, suffix);
    if (has_suffix) {
        str = str.substr(0, str.size() - suffix.size());
    }
    return has_suffix;
}

static const char * const LLM_KV_IMATRIX_DATASETS    = "imatrix.datasets";
static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
static const char * const LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";

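// Accumulated statistics for one weight tensor:
//  - values: per-column sums of squared activations, concatenated per expert/matrix
//  - counts: number of activation rows accumulated for each expert/matrix
// In the GGUF output these are stored as the "<name>.in_sum2" and "<name>.counts"
// tensors (see save_imatrix() below).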
struct Stats {
    std::vector<float>   values;
    std::vector<int64_t> counts;
};

class IMatrixCollector {
public:
    IMatrixCollector() = default;
    void set_params(common_params params) { m_params = std::move(params); }
    bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
    void save_imatrix_legacy(int32_t ncall = -1) const;
    void save_imatrix(int32_t n_chunk = -1) const;
    bool load_imatrix_legacy(const char * fname);
    bool load_imatrix(const char * file_name);
private:
    std::unordered_map<std::string, Stats> m_stats;
    common_params                          m_params;
    std::mutex                             m_mutex;
    std::vector<std::string>               m_datasets;
    int32_t                                m_last_chunk = 0;
    std::vector<char>                      m_src1_data;
    std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
};

// remove any prefix and suffixes from the name
// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
static std::string filter_tensor_name(const char * name) {
    std::string wname;
    const char * p = strchr(name, '#');
    if (p != NULL) {
        p = p + 1;
        const char * q = strchr(p, '#');
        if (q != NULL) {
            wname = std::string(p, q - p);
        } else {
            wname = p;
        }
    } else {
        wname = name;
    }
    return wname;
}

bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
    GGML_UNUSED(user_data);

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];
    std::string wname = filter_tensor_name(src0->name);

    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;

    // when ask is true, the scheduler wants to know if we are interested in data from this tensor
    // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
    if (ask) {
        if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
        if (t->op != GGML_OP_MUL_MAT) return false;
        // why are small batches ignored (<16 tokens)?
        if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
        if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
        return true;
    }

    std::lock_guard<std::mutex> lock(m_mutex);

    // copy the data from the GPU memory if needed
    const bool is_host = ggml_backend_buffer_is_host(src1->buffer);

    if (!is_host) {
        const size_t src1_nbytes = ggml_nbytes(src1);
        m_src1_data.resize(src1_nbytes);
        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, src1_nbytes);
    }

    const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
    GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));

    // TODO: 4d? (is that even used in practice?)
    // the extra dimension would need to be stored somewhere to be reflected in the imatrix file
    if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
        LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
        GGML_ASSERT(false);
    }

    // this has been adapted to the new format of storing merged experts in a single 3d tensor
    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
    if (t->op == GGML_OP_MUL_MAT_ID) {
        //   ids  -> [n_experts_used, n_tokens]
        //   src1 -> [cols, n_expert_used, n_tokens]
        const ggml_tensor * ids = t->src[2];
        const int64_t n_as  = src0->ne[2];
        const int64_t n_ids = ids->ne[0];

        // the top-k selected expert ids are stored in the ids tensor
        // for simplicity, always copy ids to host, because it is small
        // take into account that ids is not contiguous!
        GGML_ASSERT(ids->ne[1] == src1->ne[2]);
        m_ids.resize(ggml_nbytes(ids));
        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));

        auto & e = m_stats[wname];

        if (e.counts.size() == 1 && n_as > 1) {
            // broadcast, when loading an old imatrix
            e.counts.resize(n_as, e.counts[0]);
        }
        if (e.values.empty()) {
            e.values.resize(src1->ne[0]*n_as, 0);
            e.counts.resize(n_as, 0);
        }
        else if (e.values.size() != (size_t)(src1->ne[0]*n_as)) {
            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0]*n_as));
            exit(1); //GGML_ABORT("fatal error");
        }
        else if (e.counts.size() != (size_t)n_as) {
            LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_as);
            exit(1); //GGML_ABORT("fatal error");
        }
        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
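        // the importance matrix accumulates the sum of squared activations per
        // input column; divided by the matching count this gives mean(x^2),
        // which the quantization code can use to weight per-column error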
        // loop over all possible experts, regardless if they are used or not in the batch
        for (int64_t ex = 0; ex < n_as; ++ex) {
            size_t e_start = ex*src1->ne[0];

            for (int64_t idx = 0; idx < n_ids; ++idx) {
                for (int64_t row = 0; row < src1->ne[2]; ++row) {
                    const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]);

                    GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check

                    if (excur != ex) continue;
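                    // src1 rows are indexed per selected expert; the modulo
                    // presumably also handles the case where a single activation
                    // row is broadcast to all selected experts (src1->ne[1] == 1)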
                    const int64_t i11 = idx % src1->ne[1];
                    const int64_t i12 = row;
                    const float * x = (const float *)(data + i11*src1->nb[1] + i12*src1->nb[2]);

                    e.counts[ex]++;

                    for (int64_t j = 0; j < src1->ne[0]; ++j) {
                        e.values[e_start + j] += x[j] * x[j];
                        if (!std::isfinite((float)e.values[e_start + j])) {
                            LOG_ERR("%f detected in %s\n", (float)e.values[e_start + j], wname.c_str());
                            exit(1);
                        }
                    }
                }
            }
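            // a "chunk" is one full context worth of tokens; the data is
            // periodically persisted so that long runs can be interrupted
            // without losing progress. dividing by chunk_step keeps the
            // frequency checks below working even when a single batch advances
            // the chunk counter by more than one.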
            const int32_t n_chunk = e.counts[ex] / chunk_size;
            if (n_chunk > m_last_chunk) {
                const int32_t chunk_step = n_chunk - m_last_chunk;
                m_last_chunk = n_chunk;
                if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
                    save_imatrix();
                }
                if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
                    save_imatrix(m_last_chunk);
                }
            }
        }
    } else {
        auto & e = m_stats[wname];

        const int64_t n_mat = src1->ne[2] * src1->ne[3];
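        // a 3d (or 4d) src1 is treated as ne[2]*ne[3] independent matrices,
        // each with its own count, mirroring the per-expert layout above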
        if (e.values.empty()) {
            e.values.resize(src1->ne[0] * n_mat, 0);
            e.counts.resize(n_mat, 0);
        }
        else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) {
            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat));
            exit(1); //GGML_ABORT("fatal error");
        }
        else if (e.counts.size() != (size_t)n_mat) {
            LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_mat);
            exit(1); //GGML_ABORT("fatal error");
        }

        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type);

        for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) {
            for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) {
                const int64_t mat_id    = i3 * src1->ne[2] + i2;
                const int64_t mat_start = mat_id * src1->ne[0];

                for (int64_t row = 0; row < src1->ne[1]; ++row) {
                    // note: nb[3] (the byte stride) is the correct offset here, not ne[3]
                    const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->nb[3]);
                    e.counts[mat_id]++;
                    for (int64_t j = 0; j < src1->ne[0]; ++j) {
                        e.values[mat_start + j] += x[j] * x[j];
                        if (!std::isfinite((float)e.values[mat_start + j])) {
                            LOG_ERR("%f detected in %s\n", (float)e.values[mat_start + j], wname.c_str());
                            exit(1);
                        }
                    }
                }
                const int32_t n_chunk = e.counts[mat_id] / chunk_size;
                if (n_chunk > m_last_chunk) {
                    const int32_t chunk_step = n_chunk - m_last_chunk;
                    m_last_chunk = n_chunk;
                    if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
                        save_imatrix();
                    }
                    if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
                        save_imatrix(m_last_chunk);
                    }
                }
            }
        }
    }

    return true;
}

void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const {
    auto fname = m_params.out_file;

    if (ncall > 0) {
        fname += ".at_";
        fname += std::to_string(ncall);
    }

    // avoid writing imatrix entries that do not have full data
    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data

    int n_entries = 0;
    std::vector<std::string> to_store;

    bool is_first = true; // for printing
    for (const auto & kv : m_stats) {
        const int n_all = kv.second.counts.size();

        if (n_all == 0) {
            continue;
        }

        int n_zeros = 0;
        for (const int c : kv.second.counts) {
            if (c == 0) {
                n_zeros++;
            }
        }

        if (n_zeros != 0 && is_first) {
            LOG_INF("\n");
            is_first = false;
        }

        if (n_zeros == n_all) {
            LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
            continue;
        }

        if (n_zeros > 0) {
            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
            continue;
        }

        n_entries++;
        to_store.push_back(kv.first);
    }

    if (to_store.size() < m_stats.size()) {
        LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
    }

    // deterministic tensor name order
    std::sort(to_store.begin(), to_store.end());

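    // legacy binary layout, as written below:
    //   int32 n_entries
    //   per entry: int32 name_len, name bytes, int32 ncall, int32 nval, nval floats
    //   int32 last_chunk
    //   int32 dataset_len, dataset path bytes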
    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;

    std::ofstream out(fname, std::ios::binary);
    out.write((const char *) &n_entries, sizeof(n_entries));
    for (const auto & name : to_store) {
        const auto & stat = m_stats.at(name);
        const int32_t len = name.size();
        out.write((const char *) &len, sizeof(len));
        out.write(name.c_str(), len);
        const int32_t ncall = *std::max_element(stat.counts.begin(), stat.counts.end()) / chunk_size;
        out.write((const char *) &ncall, sizeof(ncall));
        const int32_t nval = stat.values.size();
        const int32_t nmat = stat.counts.size();
        out.write((const char *) &nval, sizeof(nval));
        if (nval > 0 && nmat > 0) {
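            // the legacy format stores ncall-scaled averages rather than the
            // raw sums and counts, so the values are normalized before writing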
            std::vector<float> tmp(nval);
            for (int32_t i = 0; i < nval; i++) {
                const float counts = static_cast<float>(stat.counts[i / (nval / nmat)]);
                tmp[i] = (stat.values[i] / counts) * static_cast<float>(ncall);
            }
            out.write((const char *) tmp.data(), nval * sizeof(float));
        }
    }

    // Write the number of chunks the matrix was computed with
    out.write((const char *) &m_last_chunk, sizeof(m_last_chunk));

    // Write the input filename at the end of the file to later on specify it in quantize
    {
        const char * dataset_file = m_params.prompt_file.c_str();
        int32_t len = m_params.prompt_file.size();
        // When there is no prompt but there were other imatrix files loaded, use the last dataset
        if (m_params.prompt_file.empty() && !m_datasets.empty()) {
            const std::string & dataset_str = m_datasets[m_datasets.size() - 1];
            dataset_file = dataset_str.c_str();
            len = dataset_str.size();
        }
        out.write((const char *) &len, sizeof(len));
        out.write(dataset_file, len);
    }

    LOGV(1, "\n");
    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());
}

void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
    auto fname = m_params.out_file;

    // TODO: use the new format by default also for .imatrix
    if (!str_has_suffix(fname, ".gguf")) {
        this->save_imatrix_legacy(n_chunk);
        return;
    }

    if (n_chunk > 0) {
        fname += ".at_";
        fname += std::to_string(n_chunk);
    }

    // write imatrix entries even if they don't have full data. (can be corrected when reading)
    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data

    std::vector<std::string> to_store;
    size_t data_size = 0;

    for (const auto & kv : m_stats) {
        to_store.push_back(kv.first);
        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.values.size(), GGML_MEM_ALIGN);
        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.counts.size(), GGML_MEM_ALIGN);
    }

    // deterministic tensor name order
    std::sort(to_store.begin(), to_store.end());

    struct ggml_init_params params = {
        /* .mem_size   = */ data_size,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx      = ggml_init(params);
    struct gguf_context * ctx_gguf = gguf_init_empty();

    {
        std::vector<const char *> datasets;
        datasets.reserve(m_datasets.size() + 1);
        for (size_t i = 0; i < m_datasets.size(); ++i) {
            datasets.push_back(m_datasets[i].c_str());
        }
        if (!m_params.prompt_file.empty()) {
            datasets.push_back(m_params.prompt_file.c_str());
        }

        gguf_set_val_str(ctx_gguf, "general.type", "imatrix");
        // Write the dataset paths
        gguf_set_arr_str(ctx_gguf, LLM_KV_IMATRIX_DATASETS, datasets.data(), datasets.size());
        // Write the number of chunks the matrix was computed with
        gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT, m_last_chunk);
        gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE, m_params.n_ctx / m_params.n_parallel);
    }

    for (const auto & name : to_store) {
        const auto & stat = m_stats.at(name);
        const int32_t nval = (int32_t) stat.values.size();
        const int32_t nmat = (int32_t) stat.counts.size();
        if (nval > 0 && nmat > 0) {
            struct ggml_tensor * in_sum2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nval / nmat, nmat);
            struct ggml_tensor * counts  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, nmat);
            ggml_format_name(in_sum2, "%s.in_sum2", name.c_str());
            ggml_format_name(counts,  "%s.counts",  name.c_str());

            for (int32_t j = 0; j < nval; ++j) {
                ((float *) in_sum2->data)[j] = (float) stat.values[j];
            }
            for (int32_t j = 0; j < nmat; ++j) {
                ((float *) counts->data)[j] = (float) stat.counts[j];
            }

            gguf_add_tensor(ctx_gguf, in_sum2);
            gguf_add_tensor(ctx_gguf, counts);
        }
    }

    gguf_write_to_file(ctx_gguf, fname.c_str(), false);

    LOGV(1, "\n");
    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());

    gguf_free(ctx_gguf);
    ggml_free(ctx);
}

bool IMatrixCollector::load_imatrix_legacy(const char * fname) {
    std::ifstream in(fname, std::ios::binary);
    if (!in) {
        LOG_ERR("%s: failed to open %s\n", __func__, fname);
        return false;
    }
    int n_entries;
    in.read((char *) &n_entries, sizeof(n_entries));
    if (in.fail() || n_entries < 1) {
        LOG_ERR("%s: no data in file %s\n", __func__, fname);
        return false;
    }
    // Guess the chunk size because it's not stored in the file
    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;
    for (int i = 0; i < n_entries; ++i) {
        int32_t len = 0;
        in.read((char *) &len, sizeof(len));
        std::vector<char> name_as_vec(len + 1);
        in.read((char *) name_as_vec.data(), len);
        if (in.fail()) {
            LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + 1, fname);
            return false;
        }
        name_as_vec[len] = 0;
        std::string name{name_as_vec.data()};
        auto & e = m_stats[std::move(name)];
        int32_t ncall = 0;
        in.read((char *) &ncall, sizeof(ncall));
        int32_t nval = 0;
        in.read((char *) &nval, sizeof(nval));
        if (in.fail() || nval < 1) {
            LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i);
            m_stats = {};
            return false;
        }

        if (e.values.empty()) {
            e.values.resize(nval, 0.0f);
            e.counts.resize(1, 0);
        }

        std::vector<float> tmp(nval);
        in.read((char *) tmp.data(), nval * sizeof(float));
        if (in.fail()) {
            LOG_ERR("%s: failed reading data for entry %d\n", __func__, i);
            m_stats = {};
            return false;
        }
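        // undo the scaling applied by save_imatrix_legacy(): multiplying the
        // stored averages by the guessed chunk size approximately recovers the
        // raw sums, with the counts reconstructed as ncall * chunk_size below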
        // Recreate the state as expected by save_imatrix(), and correct for weighted sum.
        for (int32_t j = 0; j < nval; j++) {
            e.values[j] += tmp[j] * chunk_size;
        }
        // The legacy format doesn't distinguish the counts for different experts
        for (size_t j = 0; j < e.counts.size(); ++j) {
            e.counts[j] += ncall * chunk_size;
        }
    }

    {
        // TODO: extract into its own method; this is also used by the GGUF-based format
        // Calculate the last chunk count
        int64_t max_count = 0;
        for (const auto & stats : m_stats) {
            for (int64_t count : stats.second.counts) {
                if (count > max_count) {
                    max_count = count;
                }
            }
        }
        m_last_chunk = max_count / chunk_size;
    }

    {
        // Read the number of calls the matrix was computed with
        int32_t n_calls;
        in.read((char *) &n_calls, sizeof(n_calls));
        // ignore it because it's not important
    }

    // Read the dataset path to include it when writing to GGUF
    if (!in.fail()) {
        int32_t len = 0;
        in.read((char *) &len, sizeof(len));
        if (!in.fail()) {
            std::vector<char> dataset;
            dataset.resize(len + 1, 0);
            in.read(dataset.data(), len);
            if (!in.fail()) {
                m_datasets.push_back(dataset.data());
            }
        }
    }

    return true;
}

// Using GGUF as the file format, for greater extensibility
bool IMatrixCollector::load_imatrix(const char * file_name) {
    struct ggml_context * ctx = nullptr;
    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ false, // the data is needed
        /* .ctx      = */ &ctx,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(file_name, meta_gguf_params);
    if (!ctx_gguf) {
        return this->load_imatrix_legacy(file_name);
    }
    const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
    if (n_entries < 1) {
        LOG_ERR("%s: no data in file %s\n", __func__, file_name);
        gguf_free(ctx_gguf);
        ggml_free(ctx);
        return false;
    }

    const int64_t datasets_key = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS);
    if (datasets_key != -1 && gguf_get_arr_type(ctx_gguf, datasets_key) == GGUF_TYPE_STRING) {
        const int64_t n = gguf_get_arr_n(ctx_gguf, datasets_key);
        m_datasets.reserve(m_datasets.size() + n);
        for (int64_t i = 0; i < n; ++i) {
            m_datasets.push_back(gguf_get_arr_str(ctx_gguf, datasets_key, i));
        }
    }

    const std::string in_sum2_suffix{".in_sum2"};
    const std::string counts_suffix{".counts"};

    // Could re-use m_stats instead, but this allows
    // checking for completeness of *each* loaded imatrix file
    // and also makes it easier to re-use a similar implementation in quantize.cpp
    // Using an ordered map to get a deterministic iteration order.
    std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;

    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
        std::string name = cur->name;

        if (name.empty()) { continue; }

        if (str_remove_suffix(name, in_sum2_suffix)) {
            // in_sum2
            sums_counts_for[std::move(name)].first = cur;
        } else if (str_remove_suffix(name, counts_suffix)) {
            // counts
            sums_counts_for[std::move(name)].second = cur;
        } else {
            // ignore other tensors
        }
    }

    for (const auto & sc : sums_counts_for) {
        const std::string        & name    = sc.first;
        const struct ggml_tensor * in_sum2 = sc.second.first;
        const struct ggml_tensor * counts  = sc.second.second;

        if (!in_sum2 || !counts) {
            LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str());
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            return false;
        }

        auto & e = m_stats[name];

        int64_t nval = ggml_nelements(in_sum2);
        if (e.values.empty()) {
            e.values.resize(nval, 0.0f);
        } else if ((size_t) nval != e.values.size()) {
            LOG_ERR("%s: mismatched sums size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) nval, e.values.size());
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            return false;
        }

        int64_t ncounts = ggml_nelements(counts);
        if (e.counts.empty()) {
            e.counts.resize(ncounts, 0);
        } else if (e.counts.size() == 1 && ncounts > 1) {
            // broadcast, when loading an old imatrix
            e.counts.resize(ncounts, e.counts[0]);
        } else if ((size_t) ncounts != e.counts.size()) {
            LOG_ERR("%s: mismatched counts size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) ncounts, e.counts.size());
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            return false;
        }

        // Recreate the state as expected by save_imatrix()
        for (int64_t j = 0; j < nval; j++) {
            e.values[j] += ((const float *) in_sum2->data)[j];
        }
        for (int64_t j = 0; j < ncounts; j++) {
            e.counts[j] += std::lround(((const float *) counts->data)[j]);
        }
    }

    // TODO: extract into its own method; this is also used by the legacy format
    // Calculate the last chunk count
    int64_t max_count = 0;
    for (const auto & stats : m_stats) {
        for (int64_t count : stats.second.counts) {
            if (count > max_count) {
                max_count = count;
            }
        }
    }
    m_last_chunk = max_count / (m_params.n_ctx / m_params.n_parallel);

    gguf_free(ctx_gguf);
    ggml_free(ctx);
    return true;
}

static IMatrixCollector g_collector;

static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
    return g_collector.collect_imatrix(t, ask, user_data);
}

struct results_log_softmax {
    double log_softmax;
    float  logit;
    float  prob;
};

static std::vector<float> softmax(const std::vector<float> & logits) {
    std::vector<float> probs(logits.size());
    float max_logit = logits[0];
    for (float v : logits) {
        max_logit = std::max(max_logit, v);
    }
    double sum_exp = 0.0;
    for (size_t i = 0; i < logits.size(); i++) {
        // Subtract the maximum logit value from the current logit value for numerical stability
        const float logit = logits[i] - max_logit;
        const float exp_logit = expf(logit);
        sum_exp += exp_logit;
        probs[i] = exp_logit;
    }
    for (size_t i = 0; i < probs.size(); i++) {
        probs[i] /= sum_exp;
    }
    return probs;
}

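// log-probability of the target token, using the same max-subtraction trick
// for numerical stability; the raw logit and probability are returned
// alongside it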
static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
    float max_logit = logits[0];
    for (int i = 1; i < n_vocab; ++i) {
        max_logit = std::max(max_logit, logits[i]);
    }
    double sum_exp = 0.0;
    for (int i = 0; i < n_vocab; ++i) {
        sum_exp += expf(logits[i] - max_logit);
    }
    return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
}

static void process_logits(
    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
    double & nll, double & nll2, float * logit_history, float * prob_history) {
    std::mutex mutex;
    int counter = 0;
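    // each worker repeatedly claims the next token index from the shared
    // counter, accumulates local sums, and folds them into nll/nll2 under
    // the lock when all tokens have been processed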
    auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
        double local_nll  = 0;
        double local_nll2 = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            int i = counter++;
            if (i >= n_token) {
                nll += local_nll; nll2 += local_nll2;
                break;
            }
            lock.unlock();
            const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i + 1]);
            const double v = -results.log_softmax;
            local_nll += v;
            local_nll2 += v*v;

            logit_history[i] = results.logit;
            prob_history[i]  = results.prob;
        }
    };
    for (auto & w : workers) {
        w = std::thread(compute);
    }
    compute();
    for (auto & w : workers) {
        w.join();
    }
}

static bool compute_imatrix(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const bool add_bos = llama_vocab_get_add_bos(vocab);

    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));

    auto tim1 = std::chrono::high_resolution_clock::now();
    LOG_INF("%s: tokenizing the input ..\n", __func__);

    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true, params.parse_special);

    auto tim2 = std::chrono::high_resolution_clock::now();
    LOG_INF("%s: tokenization took %g ms\n", __func__, 1e-3 * std::chrono::duration_cast<std::chrono::microseconds>(tim2 - tim1).count());

    if (params.i_chunk > 0) {
        if (size_t((params.i_chunk + 2) * n_ctx) >= tokens.size()) {
            LOG_ERR("%s: there will not be enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
            return false;
        }
        LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk * n_ctx);
        tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk * n_ctx);
    }

    if (int(tokens.size()) < 2 * n_ctx) {
        LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2 * n_ctx, n_ctx);
        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
        return false;
    }

    std::vector<float> logit_history;
    std::vector<float> prob_history;

    if (params.compute_ppl) {
        logit_history.resize(tokens.size());
        prob_history.resize(tokens.size());
    }

    const int n_chunk_max = tokens.size() / n_ctx;

    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
    const int n_vocab = llama_vocab_n_tokens(vocab);
    const int n_batch = params.n_batch;

    int count = 0;
    double nll  = 0.0;
    double nll2 = 0.0;

    const int num_batches = (n_ctx + n_batch - 1) / n_batch;
    const int n_seq = std::max(1, n_batch / n_ctx);

    GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0);
    GGML_ASSERT(params.n_ctx == n_seq * n_ctx);
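    // when n_batch is a multiple of n_ctx, several chunks are evaluated in
    // parallel as independent sequences in a single batch; main() sized the
    // KV cache as n_seq * n_ctx tokens to make room for this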
    llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx * n_seq), 0, 1);

    std::vector<float> logits;
    if (params.compute_ppl && num_batches > 1) {
        logits.reserve((size_t) n_ctx * n_vocab);
    }

    LOG_INF("%s: computing over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

    for (int i = 0; i < n_chunk; i += n_seq) {
        const int start =     i * n_ctx;
        const int end   = start + n_ctx;

        const int n_seq_batch = std::min(n_seq, n_chunk - i);

        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
        llama_memory_clear(llama_get_memory(ctx), true);

        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);

            // clear the batch
            common_batch_clear(batch);

            for (int seq = 0; seq < n_seq_batch; seq++) {
                int seq_start = batch_start + seq * n_ctx;

                // save original token and restore it after eval
                const auto token_org = tokens[seq_start];

                // add BOS token for the first batch of each chunk
                if (add_bos && j == 0) {
                    tokens[seq_start] = llama_vocab_bos(vocab);
                }
                for (int k = 0; k < batch_size; ++k) {
                    // NOTE: specifying all logits to get activations for the output.weight tensor
                    //       and also for the perplexity calculation.
                    // TODO: only get outputs when (params.process_output || params.compute_ppl)
                    //       (not possible when this skips FFN computation of the last layer)
                    common_batch_add(batch, tokens[seq_start + k], j*n_batch + k, { seq }, true);
                }

                // restore the original token in case it was set to BOS
                tokens[seq_start] = token_org;
            }

            if (llama_decode(ctx, batch)) {
                LOG_ERR("%s : failed to eval\n", __func__);
                llama_batch_free(batch);
                return false;
            }

            if (params.compute_ppl && num_batches > 1) {
                const auto * batch_logits = llama_get_logits(ctx);
                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
            }
        }

        if (i == 0) {
            llama_synchronize(ctx);
            const auto t_end = std::chrono::high_resolution_clock::now();
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int) (t_total * n_chunk / n_seq);
            if (total_seconds >= 60 * 60) {
                LOG("%d hours ", total_seconds / (60 * 60));
                total_seconds = total_seconds % (60 * 60);
            }
            LOG("%.2f minutes\n", total_seconds / 60.0);
        }

        if (params.compute_ppl) {
            const int first = n_ctx / 2;
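            // only the second half of each chunk is scored; the first half
            // serves as context, similar to what the perplexity example does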
            for (int seq = 0; seq < n_seq_batch; seq++) {
                const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq * n_ctx);

                llama_token * tokens_data = tokens.data() + start + seq * n_ctx + first;

                process_logits(n_vocab, all_logits + first * n_vocab,
                        tokens_data, n_ctx - 1 - first,
                        workers, nll, nll2,
                        logit_history.data() + start + seq * n_ctx + first,
                        prob_history.data()  + start + seq * n_ctx + first);

                count += n_ctx - first - 1;

                LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
            }
            fflush(stdout);

            logits.clear();
        }
    }

    LOG("\n");

    if (params.compute_ppl) {
        nll2 /= count;
        nll /= count;
        const double ppl = exp(nll);
        nll2 -= nll * nll;
        if (nll2 > 0) {
            nll2 = sqrt(nll2 / (count - 1));
            LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2 * ppl);
        } else {
            LOG("Unexpected negative standard deviation of log(prob)\n");
        }
    }

    llama_batch_free(batch);

    return true;
}

int main(int argc, char ** argv) {
    common_params params;

    params.out_file = "imatrix.gguf";

    params.n_ctx = 512;
    params.escape = false;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
        return 1;
    }

    common_init();

    const int32_t n_ctx = params.n_ctx;

    if (n_ctx <= 0) {
        LOG_ERR("%s: imatrix tool requires '--ctx-size' > 0\n", __func__);
        return 1;
    }
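    // if the requested batch is larger than the context, evaluate multiple
    // chunks in parallel as separate sequences sharing one KV cache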
    {
        const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
        const int32_t n_kv = n_seq * n_ctx;

        params.n_parallel = n_seq;
        params.n_ctx      = n_kv;
        params.n_batch    = std::min(params.n_batch, n_kv);
    }

    g_collector.set_params(params);

    for (const auto & in_file : params.in_files) {
        LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
        if (!g_collector.load_imatrix(in_file.c_str())) {
            LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
            return 1;
        }
    }

    if (params.in_files.size() > 1) {
        LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
        g_collector.save_imatrix();
    }

    llama_backend_init();
    llama_numa_init(params.numa);

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    params.cb_eval = ik_collect_imatrix;
    params.cb_eval_user_data = NULL;
    params.warmup = false;

    // init
    common_init_result llama_init = common_init_from_params(params);

    llama_model * model = llama_init.model.get();
    llama_context * ctx = llama_init.context.get();

    if (model == nullptr || ctx == nullptr) {
        LOG_ERR("%s : failed to init\n", __func__);
        return 1;
    }

    const int n_ctx_train = llama_model_n_ctx_train(model);
    if (params.n_ctx > n_ctx_train) {
        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, params.n_ctx);
    }

    // print system information
    {
        LOG_INF("\n");
        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
    }

    if (params.prompt.empty()) {
        if (params.in_files.empty()) {
            LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n");
            return 1;
        }
        LOG_INF("No prompt provided; combining precomputed matrices only.\n");
    } else {
        if (!compute_imatrix(ctx, params, n_ctx)) {
            return 1;
        }
    }

    g_collector.save_imatrix();

    LOG("\n");
    llama_perf_context_print(ctx);

    llama_backend_free();

    return 0;
}