ci : integrate with ggml-org/ci (#2250)

* ci : run ctest ggml-ci * ci : add open llama 3B-v2 tests ggml-ci * ci : disable wget progress output ggml-ci * ci : add open llama 3B-v2 tg tests for q4 and q5 quantizations ggml-ci * tests : try to fix tail free sampling test ggml-ci * ci : add K-quants ggml-ci * ci : add short perplexity tests ggml-ci * ci : add README.md * ppl : add --chunks argument to limit max number of chunks ggml-ci * ci : update README
2025-08-15 12:42:40 -04:00 · 2023-07-18 14:24:43 +03:00
parent 6cbf9dfb32
commit d01bccde9f
8 changed files with 312 additions and 6 deletions
--- a/examples/common.h
+++ b/examples/common.h
@@ -28,6 +28,7 @@ struct gpt_params {
    int32_t n_ctx                           = 512; // context size
    int32_t n_batch                         = 512; // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep                          = 0;   // number of tokens to keep from initial prompt
+    int32_t n_chunks                        = -1;  // max number of chunks to process (-1 = unlimited)
    int32_t n_gpu_layers                    = 0;   // number of layers to store in VRAM
    int32_t main_gpu                        = 0;   // the GPU that is used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs