Merge branch 'master' into compilade/mamba2

Francis Couture-Harpin
2025-05-01 17:43:53 -04:00
462 changed files with 75597 additions and 48146 deletions


@ -13,6 +13,7 @@ Checks: >
-readability-magic-numbers,
-readability-uppercase-literal-suffix,
-readability-simplify-boolean-expr,
-readability-math-missing-parentheses,
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,


@ -14,9 +14,9 @@ WORKDIR /app
COPY . .
RUN if [ "$TARGETARCH" = "amd64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
elif [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
else \
echo "Unsupported architecture"; \
exit 1; \


@ -21,7 +21,7 @@ COPY . .
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \


@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \


@ -1,4 +1,4 @@
ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
FROM ascendai/cann:$ASCEND_VERSION AS build
@ -6,7 +6,7 @@ WORKDIR /app
COPY . .
RUN yum install -y gcc g++ cmake make
RUN yum install -y gcc g++ cmake make libcurl-devel
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
@ -22,7 +22,7 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
RUN echo "Building with static libs" && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
cmake --build build --config Release --target llama-cli
# TODO: use image with NNRT


@ -35,7 +35,7 @@ COPY . .
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \


@ -17,8 +17,8 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# gfx906 is deprecated
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
ARG ROCM_DOCKER_ARCH=gfx1100
ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
#ARG ROCM_DOCKER_ARCH=gfx1100
# Set nvcc architecture
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
@ -40,7 +40,7 @@ WORKDIR /app
COPY . .
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
&& cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib \


@ -16,7 +16,7 @@ WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \


@ -0,0 +1,25 @@
name: 'Windows - Setup CURL'
description: 'Composite action, to be reused in other workflows'
inputs:
curl_version:
description: 'CURL version'
required: false
default: '8.6.0_6'
outputs:
curl_path:
description: "Path to the downloaded libcurl"
value: ${{ steps.get_libcurl.outputs.curl_path }}
runs:
using: "composite"
steps:
- name: libCURL
id: get_libcurl
shell: powershell
env:
CURL_VERSION: ${{ inputs.curl_version }}
run: |
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
mkdir $env:RUNNER_TEMP/libcurl
tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT


@ -104,7 +104,6 @@ jobs:
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DLLAMA_CUBLAS=ON \
-DCUDAToolkit_ROOT=/usr/local/cuda \
-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \

.github/workflows/build-linux-cross.yml (new file, 124 lines)

@ -0,0 +1,124 @@
name: Build on Linux using cross-compiler
on:
workflow_dispatch:
workflow_call:
jobs:
ubuntu-latest-riscv64-cpu-cross:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup Riscv
run: |
sudo dpkg --add-architecture riscv64
sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
/etc/apt/sources.list /etc/apt/apt-mirrors.txt
sudo apt-get clean
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu \
libcurl4-openssl-dev:riscv64
- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
ubuntu-latest-riscv64-vulkan-cross:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Setup Riscv
run: |
sudo dpkg --add-architecture riscv64
sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
/etc/apt/sources.list /etc/apt/apt-mirrors.txt
sudo apt-get clean
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu \
libvulkan-dev:riscv64 \
libcurl4-openssl-dev:riscv64
- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
ubuntu-latest-arm64-vulkan-cross:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Setup Arm64
run: |
sudo dpkg --add-architecture arm64
sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
/etc/apt/sources.list /etc/apt/apt-mirrors.txt
sudo apt-get clean
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
crossbuild-essential-arm64 \
libvulkan-dev:arm64 \
libcurl4-openssl-dev:arm64
- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=aarch64 \
-DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
-DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)


@ -10,7 +10,7 @@ on:
push:
branches:
- master
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
@ -54,6 +54,7 @@ jobs:
continue-on-error: true
run: |
brew update
brew install curl
- name: Build
id: cmake_build
@ -62,7 +63,6 @@ jobs:
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DGGML_RPC=ON
@ -92,7 +92,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
- name: Upload artifacts
@ -123,6 +122,7 @@ jobs:
continue-on-error: true
run: |
brew update
brew install curl
- name: Build
id: cmake_build
@ -133,7 +133,6 @@ jobs:
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
@ -162,7 +161,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
- name: Upload artifacts
@ -207,7 +205,6 @@ jobs:
run: |
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
@ -246,7 +243,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
- name: Upload artifacts
@ -281,7 +277,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Build
id: cmake_build
@ -322,7 +318,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Build
id: cmake_build
@ -360,7 +356,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Build
id: cmake_build
@ -397,7 +393,7 @@ jobs:
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
- name: Build
id: cmake_build
@ -431,7 +427,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
- name: Upload artifacts
@ -454,7 +449,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@ -467,6 +462,7 @@ jobs:
run: |
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DGGML_HIP=ON
cmake --build build --config Release -j $(nproc)
@ -476,6 +472,7 @@ jobs:
cmake -B build2 -S . \
-DCMAKE_C_COMPILER=hipcc \
-DCMAKE_CXX_COMPILER=hipcc \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DGGML_HIP=ON
cmake --build build2 --config Release -j $(nproc)
@ -528,7 +525,7 @@ jobs:
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp
sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev
- name: install oneAPI MKL library
shell: bash
@ -576,7 +573,7 @@ jobs:
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp
sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev
- name: install oneAPI MKL library
shell: bash
@ -604,6 +601,10 @@ jobs:
-DGGML_SYCL_F16=ON
cmake --build build --config Release -j $(nproc)
# Disabled for now due to a sporadic syncing issue.
# build-linux-cross:
# uses: ./.github/workflows/build-linux-cross.yml
macOS-latest-cmake-ios:
runs-on: macos-latest
@ -631,6 +632,7 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
@ -666,6 +668,7 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
@ -674,6 +677,36 @@ jobs:
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
macOS-latest-cmake-visionos:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: cmake_build
run: |
sysctl -a
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
macOS-latest-swift:
runs-on: macos-latest
@ -705,17 +738,17 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
sudo cmake --install build --config Release
- name: xcodebuild for swift package
id: xcodebuild
run: |
xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}"
./build-xcframework.sh
windows-msys2:
runs-on: windows-latest
@ -773,7 +806,7 @@ jobs:
env:
OPENBLAS_VERSION: 0.3.23
SDE_VERSION: 9.33.0-2024-01-07
VULKAN_VERSION: 1.3.261.1
VULKAN_VERSION: 1.4.309.0
strategy:
matrix:
@ -866,10 +899,17 @@ jobs:
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build build-arm64-release --target install --config release
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cmake -S . -B build ${{ matrix.defines }}
cmake -S . -B build ${{ matrix.defines }} `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Add libopenblas.dll
@ -929,9 +969,10 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt
Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
- name: Upload artifacts
@ -957,7 +998,7 @@ jobs:
DEBIAN_FRONTEND: noninteractive
run: |
apt update
apt install -y cmake build-essential ninja-build libgomp1 git
apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@ -1059,16 +1100,23 @@ jobs:
run: |
choco install ninja
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
shell: cmd
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -S . -B build -G "Ninja Multi-Config" ^
-DLLAMA_BUILD_SERVER=ON ^
-DGGML_NATIVE=OFF ^
-DGGML_CUDA=ON ^
-DGGML_RPC=ON
-DGGML_RPC=ON ^
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include"
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
@ -1089,7 +1137,10 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
@ -1144,6 +1195,8 @@ jobs:
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
# TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
run: examples/sycl/win-build-sycl.bat
@ -1203,6 +1256,11 @@ jobs:
id: checkout
uses: actions/checkout@v4
- name: Clone rocWMMA repository
id: clone_rocwmma
run: |
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
- name: Install
id: depends
run: |
@ -1224,19 +1282,29 @@ jobs:
key: ${{ github.job }}
evict-old-files: 1d
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
-DCMAKE_BUILD_TYPE=Release `
-DGGML_HIP=ON `
-DGGML_RPC=ON
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_RPC=ON `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
# TODO: reuse windows-latest-cmake-hip instead of duplicating this job
windows-latest-cmake-hip-release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
runs-on: windows-latest
@ -1252,6 +1320,11 @@ jobs:
with:
fetch-depth: 0
- name: Clone rocWMMA repository
id: clone_rocwmma
run: |
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
@ -1273,18 +1346,27 @@ jobs:
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
-DCMAKE_BUILD_TYPE=Release `
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_HIP=ON `
-DGGML_RPC=ON
-DGGML_RPC=ON `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
@ -1306,7 +1388,10 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
- name: Upload artifacts
@ -1321,6 +1406,8 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Build
id: cmake_build
@ -1329,6 +1416,7 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
@ -1336,15 +1424,40 @@ jobs:
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
sudo cmake --install build --config Release
- name: xcodebuild for swift package
id: xcodebuild
run: |
xcodebuild -scheme llama-Package -destination 'generic/platform=iOS'
./build-xcframework.sh
- name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
name: llama-${{ steps.tag.outputs.name }}-xcframework
android-build:
runs-on: ubuntu-latest
@ -1654,16 +1767,17 @@ jobs:
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
defaults:
run:
shell: bash -el {0}
runs-on: ubuntu-24.04-arm
shell: bash -el {0}
strategy:
matrix:
arch: [x86, aarch64]
cann:
- '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
- '8.1.RC1.alpha001-910b-openeuler22.03-py3.10'
device:
- 'ascend910b3'
build:
- 'Release'
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
container: ascendai/cann:${{ matrix.cann }}
steps:
- name: Checkout
@ -1672,7 +1786,7 @@ jobs:
- name: Dependencies
run: |
yum update -y
yum install -y git gcc gcc-c++ make cmake
yum install -y git gcc gcc-c++ make cmake libcurl-devel
- name: Build
run: |


@ -36,13 +36,13 @@ jobs:
matrix:
config:
# Multi-stage build
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
steps:
- name: Check out the repo
uses: actions/checkout@v4


@ -129,7 +129,6 @@ jobs:
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ;
@ -142,7 +141,6 @@ jobs:
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
@ -154,13 +152,14 @@ jobs:
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
env:
GITHUB_ACTIONS: "true"
run: |
cd examples/server/tests
./tests.sh
@ -193,17 +192,14 @@ jobs:
- name: libCURL
id: get_libcurl
env:
CURL_VERSION: 8.6.0_6
run: |
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
mkdir $env:RUNNER_TEMP/libcurl
tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
- name: Python setup
@ -219,8 +215,10 @@ jobs:
- name: Copy Libcurl
id: prepare_libcurl
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
- name: Tests
id: server_integration_tests

.gitignore (2 changed lines)

@ -45,6 +45,8 @@ lcov-report/
tags
.build/
build*
release
debug
!build-info.cmake
!build-info.cpp.in
!build-info.sh

AUTHORS (61 changed lines)

@ -1,4 +1,4 @@
# date: Tue Feb 4 13:04:05 EET 2025
# date: Sat Mar 8 18:23:52 EET 2025
# this file is auto-generated by scripts/gen-authors.sh
0cc4m <picard12@live.de>
@ -8,10 +8,12 @@
3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
44670 <44670@users.noreply.github.com>
65a <10104049+65a@users.noreply.github.com>
708-145 <40387547+708-145@users.noreply.github.com>
AN Long <aisk@users.noreply.github.com>
AT <manyoso@users.noreply.github.com>
Aarni Koskela <akx@iki.fi>
Aaron Miller <apage43@ninjawhale.com>
Aaron Teo <57927438+taronaeo@users.noreply.github.com>
Aaryaman Vasishta <aaryaman.vasishta@amd.com>
Abheek Gulati <abheekg@hotmail.com>
Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
@ -20,6 +22,7 @@ Adithya Balaji <adithya.b94@gmail.com>
AdithyanI <adithyan.i4internet@gmail.com>
Adrian <smith.adriane@gmail.com>
Adrian Hesketh <a-h@users.noreply.github.com>
Adrian Kretz <me@akretz.com>
Adrien Gallouët <adrien@gallouet.fr>
Adrien Gallouët <angt@huggingface.co>
Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
@ -28,15 +31,18 @@ AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
AidanBeltonS <aidan.belton@codeplay.com>
Aisuko <urakiny@gmail.com>
Akarshan Biswas <akarshan.biswas@gmail.com>
Akarshan Biswas <akarshan@menlo.ai>
Akarshan Biswas <akarshanbiswas@fedoraproject.org>
Al Mochkin <14274697+amochkin@users.noreply.github.com>
Albert Jin <albert.jin@gmail.com>
Alberto <57916483+albbus-stack@users.noreply.github.com>
Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
Alberto Cabrera Pérez <alberto.cabrera@intel.com>
Aleksei Nikiforov <103434461+AlekseiNikiforovIBM@users.noreply.github.com>
Alex <awhill19@icloud.com>
Alex Azarov <alex@azarov.by>
Alex Azarov <alexander.azarov@mapbox.com>
Alex Brooks <alex.brooks@ibm.com>
Alex Klinkhamer <from.github.com.917@grencez.dev>
Alex Klinkhamer <git@grencez.dev>
Alex Nguyen <tiendung@users.noreply.github.com>
@ -67,6 +73,7 @@ Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
Andy Salerno <andysalerno@gmail.com>
Andy Tai <andy-tai@users.noreply.github.com>
Anthony Van de Gejuchte <anthonyvdgent@gmail.com>
Antoine Viallon <antoine@lesviallon.fr>
Antonis Makropoulos <benuix@gmail.com>
Arik Poznanski <arikpoz@users.noreply.github.com>
Armen Kaleshian <kriation@users.noreply.github.com>
@ -83,6 +90,7 @@ Atsushi Tatsuma <yoshoku@outlook.com>
Austin <77757836+teleprint-me@users.noreply.github.com>
AustinMroz <austinmroz@utexas.edu>
BADR <contact@pythops.com>
BB-fat <45072480+BB-fat@users.noreply.github.com>
Bach Le <bach@bullno1.com>
Bailey Chittle <39804642+bachittle@users.noreply.github.com>
BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
@ -101,6 +109,7 @@ Bert Wagner <github@bertwagner.com>
Billel Mokeddem <billel.mokeddem.ml@gmail.com>
Bingan <70050083+binganao@users.noreply.github.com>
Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com>
Bodhi <3882561+BodhiHu@users.noreply.github.com>
Bodo Graumann <mail@bodograumann.de>
Bono Lv <lvscar@users.noreply.github.com>
Borislav Stanimirov <b.stanimirov@abv.bg>
@ -128,6 +137,7 @@ CentricStorm <CentricStorm@users.noreply.github.com>
Chad Brewbaker <crb002@gmail.com>
Changyeon Kim <cyzero.kim@samsung.com>
Chao Jiang <jc19chaoj@zoho.com>
Charles Duffy <charles@dyfis.net>
Charles Xu <63788048+chaxu01@users.noreply.github.com>
Charles Xu <charles.xu@arm.com>
Chen Xi <xi2.chen@intel.com>
@ -139,12 +149,14 @@ Chris Kuehl <ckuehl@ckuehl.me>
Christian Demsar <christian@github.email.demsar.us>
Christian Demsar <crasm@git.vczf.us>
Christian Falch <875252+chrfalch@users.noreply.github.com>
Christian Fillion <cfillion@users.noreply.github.com>
Christian Kastner <ckk@kvr.at>
Christian Kögler <ck3d@gmx.de>
Christian Köhnenkamp <cvk5@me.com>
Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
Christopher Nielsen <62156882+mascguy@users.noreply.github.com>
Clark Saben <76020733+csaben@users.noreply.github.com>
Clauszy <zhangyub@uniontech.com>
Clint Herron <hanclinto@gmail.com>
Conrad Kramer <conrad@conradkramer.com>
Corentin REGAL <corentin.regal@gmail.com>
@ -163,6 +175,7 @@ Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Daniel Illescas Romero <illescas.daniel@protonmail.com>
Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
Daniele <57776841+daniandtheweb@users.noreply.github.com>
Danny Milosavljevic <dannym@friendly-machines.com>
DannyDaemonic <DannyDaemonic@gmail.com>
Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
Dave <dave-fl@users.noreply.github.com>
@ -170,6 +183,7 @@ Dave Airlie <airlied@gmail.com>
Dave Airlie <airlied@redhat.com>
Dave Della Costa <ddellacosta+github@gmail.com>
David Friehs <david@friehs.info>
David Huang <1969802+hjc4869@users.noreply.github.com>
David Kennedy <dakennedyd@gmail.com>
David Pflug <david@pflug.email>
David Renshaw <dwrenshaw@gmail.com>
@ -236,6 +250,7 @@ Felix <stenbackfelix@gmail.com>
Finn Voorhees <finnvoorhees@gmail.com>
Firat <firatkiral@gmail.com>
FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
Florent BENOIT <fbenoit@redhat.com>
Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
Francisco Melo <43780565+francis2tm@users.noreply.github.com>
@ -254,6 +269,7 @@ Gary Mulder <gjmulder@gmail.com>
Gavin Zhao <gavinzhaojw@protonmail.com>
Genkagaku.GPT <hlhr202@163.com>
Georgi Gerganov <ggerganov@gmail.com>
Gian-Carlo Pascutto <gcp@sjeng.org>
Gilad S <giladgd@users.noreply.github.com>
Gilad S. <7817232+giladgd@users.noreply.github.com>
Giuseppe Scrivano <giuseppe@scrivano.org>
@ -267,7 +283,9 @@ Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com>
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
Haggai Nuchi <h.nuchi@gmail.com>
Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
Hale Chan <halechan@qq.com>
Hamdoud Hakem <90524568+hamdoudhakem@users.noreply.github.com>
Han Yin <han.yin@arm.com>
HanishKVC <hanishkvc@gmail.com>
Haohui Mai <ricetons@gmail.com>
Haoxiang Fei <tonyfettes@tonyfettes.com>
@ -278,6 +296,7 @@ Haus1 <haus.xda@gmail.com>
Henk Poley <HenkPoley@gmail.com>
Henri Vasserman <henv@hot.ee>
Henrik Forstén <henrik.forsten@gmail.com>
Henry Linjamäki <henry.linjamaki@gmail.com>
Herman Semenov <GermanAizek@yandex.ru>
Hesen Peng <hesen.peng@gmail.com>
HimariO <dsfhe49854@gmail.com>
@ -307,6 +326,7 @@ Ivan <nekotekina@gmail.com>
Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
Ivan Komarov <Ivan.Komarov@dfyz.info>
Ivan Stepanov <ivanstepanovftw@gmail.com>
JC <43374599+MrSMlT@users.noreply.github.com>
JFLFY2255 <JFLFY2255@163.com>
JH23X <165871467+JH23X@users.noreply.github.com>
Jack Mousseau <jack@software.inc>
@ -325,6 +345,7 @@ Jan Ploski <jpl@plosquare.com>
Jannis Schönleber <joennlae@gmail.com>
Jared Van Bortel <cebtenzzre@gmail.com>
Jared Van Bortel <jared@nomic.ai>
Jason C.H <ctrysbita@outlook.com>
Jason McCartney <jmac@theroot.org>
Jason Stillerman <jason.t.stillerman@gmail.com>
Jean-Christophe Hoelt <hoelt@fovea.cc>
@ -342,6 +363,7 @@ Jiahao Li <liplus17@163.com>
Jian Liao <jianliao@users.noreply.github.com>
JidongZhang-THU <1119708529@qq.com>
Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
Jinyang He <hejinyang@loongson.cn>
Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
Jiří Sejkora <Sejseloid@gmail.com>
Joan Fontanals <jfontanalsmartinez@gmail.com>
@ -379,6 +401,7 @@ Justine Tunney <jtunney@mozilla.com>
Juuso Alasuutari <juuso.alasuutari@gmail.com>
KASR <karim.asrih@gmail.com>
Kamil Tomšík <info@tomsik.cz>
Kante Yin <kerthcet@gmail.com>
Karol Kontny <82021046+kkontny@users.noreply.github.com>
Karsten Weiss <knweiss@gmail.com>
Karthick <j.karthic2004@gmail.com>
@ -419,6 +442,7 @@ LoganDark <github@logandark.mozmail.com>
Loïc Carrère <loic.carrere@gmail.com>
LostRuins <39025047+LostRuins@users.noreply.github.com>
LostRuins Concedo <39025047+LostRuins@users.noreply.github.com>
Lucas Moura Belo <lucas.belo@live.com>
Luciano <lucianostrika44@gmail.com>
Luo Tian <lt@basecity.com>
Lyle Dean <dean@lyle.dev>
@ -463,6 +487,7 @@ Matthew Tejo <matthew.tejo@gmail.com>
Matvey Soloviev <blackhole89@gmail.com>
Max Krasnyansky <max.krasnyansky@gmail.com>
Max Krasnyansky <quic_maxk@quicinc.com>
Maxim Evtush <154841002+maximevtush@users.noreply.github.com>
Maxime <672982+maximegmd@users.noreply.github.com>
Maximilian Winter <maximilian.winter.91@gmail.com>
Meng Zhang <meng@tabbyml.com>
@ -494,6 +519,7 @@ Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
Molly Sophia <mollysophia379@gmail.com>
MoonRide303 <130458190+MoonRide303@users.noreply.github.com>
MorganRO8 <47795945+MorganRO8@users.noreply.github.com>
Murilo Santana <mvrilo@gmail.com>
Musab Gultekin <musabgultekin@users.noreply.github.com>
@ -524,6 +550,7 @@ Nikolas <127742645+nneubacher@users.noreply.github.com>
Nindaleth <Nindaleth@users.noreply.github.com>
Nuno <rare-magma@posteo.eu>
OSecret <135510162+OLSecret@users.noreply.github.com>
Oleksandr Kuvshynov <661042+okuvshynov@users.noreply.github.com>
Oleksandr Nikitin <oleksandr@tvori.info>
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
Olivier Chafik <ochafik@users.noreply.github.com>
@ -533,6 +560,7 @@ PAB <pierreantoine.bannier@gmail.com>
Pablo Duboue <pablo.duboue@gmail.com>
Pascal Patry <ppatry@mtacitlabs.com>
Patrice Ferlet <metal3d@gmail.com>
Patrick Peng <retr0@retr0.blog>
Paul Tsochantaris <ptsochantaris@icloud.com>
Pavel Zloi <github.com@drteam.rocks>
Pavol Rusnak <pavol@rusnak.io>
@ -549,6 +577,7 @@ Pieter Ouwerkerk <pieter.ouwerkerk@gmail.com>
Plamen Minev <pacominev@gmail.com>
Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
Przemysław Pawełczyk <przemoc@gmail.com>
PureJourney <edward.pong@qq.com>
Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
Qingyou Meng <meng.qingyou@gmail.com>
Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
@ -564,14 +593,17 @@ Rand Xie <randxiexyy29@gmail.com>
Randall Fitzgerald <randall@dasaku.net>
Random Fly <renfei8@live.cn>
Reinforce-II <fate@eastal.com>
Rémy O <remyoudompheng@gmail.com>
Rémy Oudompheng <oudomphe@phare.normalesup.org>
Ren Xuancheng <jklj077@users.noreply.github.com>
Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
Reza Kakhki <rezakakhki.de@gmail.com>
Reza Rahemtola <49811529+RezaRahemtola@users.noreply.github.com>
RhinoDevel <RhinoDevel@users.noreply.github.com>
Riccardo Orlando <Riccorl@users.noreply.github.com>
Riceball LEE <snowyu.lee@gmail.com>
Rich Dougherty <rich@rd.nz>
Richard <r-burton@hotmail.co.uk>
Richard Kiss <him@richardkiss.com>
Richard Roberson <richardr1126@gmail.com>
Rick G <26732651+TheFlipbook@users.noreply.github.com>
@ -588,6 +620,7 @@ Robert Sung-wook Shin <edp1096@users.noreply.github.com>
Robey Holderith <robey@flaminglunchbox.net>
Robyn <robyngraf@users.noreply.github.com>
Roger Meier <r.meier@siemens.com>
Rohanjames1997 <rohan.james4@gmail.com>
Roland <14355895+rbur0425@users.noreply.github.com>
Romain Biessy <romain.biessy@codeplay.com>
Romain D <90720+Artefact2@users.noreply.github.com>
@ -610,6 +643,7 @@ Ryan Landay <rlanday@gmail.com>
Ryder Wishart <ryderwishart@gmail.com>
Ryuei <louixs@users.noreply.github.com>
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
SAMI <samuel.koesnadi@stud.uni-due.de>
SRHMorris <69468379+SRHMorris@users.noreply.github.com>
SXX <sxx1136965276@gmail.com>
SakuraUmi <yukinon244@gmail.com>
@ -634,6 +668,8 @@ Shane A <shanea@allenai.org>
Shangning Xu <32517059+xushangning@users.noreply.github.com>
Shankar <gshankar.87@gmail.com>
Shanshan Shen <467638484@qq.com>
Shelby Jenkins <47464908+ShelbyJenkins@users.noreply.github.com>
Sheldon Robinson <sheldon.robinson@live.com>
Shijie <821898965@qq.com>
Shintarou Okada <kokuzen@gmail.com>
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
@ -713,18 +749,24 @@ Victor Nogueira <felladrin@gmail.com>
Victor Z. Peng <ziliangdotme@gmail.com>
Viet-Anh NGUYEN (Andrew) <vietanh.dev@gmail.com>
Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com>
Vitali Lovich <vlovich+github@gmail.com>
Vivian <vynride@gmail.com>
Vlad <spitfireage@gmail.com>
Vladimir <bogdad@gmail.com>
Vladimir Malyutin <first-leon@yandex.ru>
Vladimir Vuksanovic <109677816+vvuksanovic@users.noreply.github.com>
Vladimir Zorin <vladimir@deviant.guru>
VoidIsVoid <343750470@qq.com>
Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
Wagner Bruna <wbruna@users.noreply.github.com>
Wang Qin <37098874+wangqin0@users.noreply.github.com>
Wang Ran (汪然) <wangr@smail.nju.edu.cn>
WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
Weird Constructor <weirdconstructor@gmail.com>
Weizhao Ouyang <o451686892@gmail.com>
Welby Seely <welbyseely@gmail.com>
Wentai Zhang <rchardx@gmail.com>
Wilken Gottwalt <12194808+wgottwalt@users.noreply.github.com>
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
William Tambellini <william.tambellini@gmail.com>
William Tambellini <wtambellini@sdl.com>
@ -816,6 +858,8 @@ chaihahaha <chai836275709@gmail.com>
chiranko <96988916+chiranko@users.noreply.github.com>
clibdev <52199778+clibdev@users.noreply.github.com>
clyang <clyang@clyang.net>
cmdr2 <secondary.cmdr2@gmail.com>
cmdr2 <shashank.shekhar.global@gmail.com>
cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
codezjx <code.zjx@gmail.com>
coezbek <c.oezbek@gmail.com>
@ -835,6 +879,7 @@ deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
devojony <61173062+devojony@users.noreply.github.com>
ditsuke <ditsuke@protonmail.com>
divinity76 <divinity76@gmail.com>
dm4 <dm4@secondstate.io>
dm4 <sunrisedm4@gmail.com>
dotpy314 <33351922+dotpy314@users.noreply.github.com>
drbh <david.richard.holtz@gmail.com>
@ -849,6 +894,7 @@ fairydreaming <166155368+fairydreaming@users.noreply.github.com>
fengerhu1 <2748250768@qq.com>
fj-y-saito <85871716+fj-y-saito@users.noreply.github.com>
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
fxzjshm <11426482+fxzjshm@users.noreply.github.com>
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
gliptic <gliptic@users.noreply.github.com>
gn64 <yukikaze.jp@gmail.com>
@ -873,6 +919,7 @@ hydai <z54981220@gmail.com>
iSma <ismail.senhaji@gmail.com>
iacore <74560659+iacore@users.noreply.github.com>
icppWorld <124377669+icppWorld@users.noreply.github.com>
igardev <49397134+igardev@users.noreply.github.com>
igarnier <igarnier@protonmail.com>
intelmatt <61025942+intelmatt@users.noreply.github.com>
iohub <rickyang.pro@gmail.com>
@ -880,6 +927,7 @@ issixx <46835150+issixx@users.noreply.github.com>
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
jameswu2014 <545426914@qq.com>
jason_w <jason.wang@126.com>
jdomke <28772296+jdomke@users.noreply.github.com>
jiahao su <damow890@gmail.com>
jiez <373447296@qq.com>
@ -891,6 +939,7 @@ jon-chuang <9093549+jon-chuang@users.noreply.github.com>
jp-x-g <jpxg-dev@protonmail.com>
jukofyork <69222624+jukofyork@users.noreply.github.com>
junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
junchao-zhao <68935141+junchao-loongson@users.noreply.github.com>
jwj7140 <32943891+jwj7140@users.noreply.github.com>
k.h.lai <adrian.k.h.lai@outlook.com>
kaizau <kaizau@users.noreply.github.com>
@ -925,6 +974,7 @@ ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
luoyu-intel <yu.luo@intel.com>
m3ndax <adrian.goessl@outlook.com>
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
magicse <magicse@users.noreply.github.com>
mahorozte <41834471+mahorozte@users.noreply.github.com>
makomk <makosoft@googlemail.com>
manikbhandari <mbbhandarimanik2@gmail.com>
@ -935,6 +985,7 @@ matt23654 <matthew.webber@protonmail.com>
matteo <matteogeniaccio@yahoo.it>
mdrokz <mohammadmunshi@gmail.com>
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
midnight <midnightmagic@users.noreply.github.com>
minarchist <minarchist@users.noreply.github.com>
mj-shifu <77107165+mj-shifu@users.noreply.github.com>
mmyjona <jonathan.gonse@gmail.com>
@ -958,10 +1009,12 @@ omahs <73983677+omahs@users.noreply.github.com>
oobabooga <112222186+oobabooga@users.noreply.github.com>
opparco <parco.opaai@gmail.com>
ostix360 <55257054+ostix360@users.noreply.github.com>
pascal-lc <49066376+pascal-lc@users.noreply.github.com>
pculliton <phillipculliton@gmail.com>
peidaqi <peidaqi@gmail.com>
pengxin99 <pengxin.yuan@intel.com>
perserk <perserk@gmail.com>
petterreinholdtsen <pere-github@hungry.com>
piDack <104877312+piDack@users.noreply.github.com>
pmysl <piotr.myslinski@outlook.com>
postmasters <namnguyen@google.com>
@ -983,6 +1036,7 @@ semidark <me@semidark.net>
serhii-nakon <57632032+serhii-nakon@users.noreply.github.com>
sharpHL <132747147+sharpHL@users.noreply.github.com>
shibe2 <shibe@tuta.io>
simon886212 <37953122+simon886212@users.noreply.github.com>
singularity <12184989+singularity-s0@users.noreply.github.com>
sjinzh <sjinzh@gmail.com>
sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
@ -1000,10 +1054,12 @@ tarcey <cey.tarik@gmail.com>
tc-mb <157115220+tc-mb@users.noreply.github.com>
texmex76 <40733439+texmex76@users.noreply.github.com>
thement <40525767+thement@users.noreply.github.com>
theraininsky <76763719+theraininsky@users.noreply.github.com>
thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
tjohnman <tjohnman@users.noreply.github.com>
toyer <2042519524@qq.com>
tslmy <tslmy@users.noreply.github.com>
tv1wnd <55383215+tv1wnd@users.noreply.github.com>
ubik2 <ubik2@users.noreply.github.com>
uint256_t <konndennsa@gmail.com>
uint256_t <maekawatoshiki1017@gmail.com>
@ -1014,6 +1070,7 @@ valiray <133289098+valiray@users.noreply.github.com>
vb <vaibhavs10@gmail.com>
vik <vikhyatk@gmail.com>
viric <viric@viric.name>
vmobilis <75476228+vmobilis@users.noreply.github.com>
vodkaslime <646329483@qq.com>
vvhg1 <94630311+vvhg1@users.noreply.github.com>
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
@ -1028,6 +1085,8 @@ wzy <32936898+Freed-Wu@users.noreply.github.com>
xaedes <xaedes@gmail.com>
xaedes <xaedes@googlemail.com>
xctan <axunlei@gmail.com>
xiaobing318 <71554036+xiaobing318@users.noreply.github.com>
xiaofei <hbuxiaofei@gmail.com>
xloem <0xloem@gmail.com>
yangli2 <yangli2@gmail.com>
ymcki <84055651+ymcki@users.noreply.github.com>


@ -29,6 +29,8 @@ else()
set(LLAMA_STANDALONE OFF)
endif()
option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
if (EMSCRIPTEN)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
@ -79,7 +81,7 @@ option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
# 3rd party libs
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
# Required for relocatable CMake package
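Since this hunk flips `LLAMA_CURL` to `ON` by default, a plain configure now expects the libcurl development package to be present, which is why the workflow and Dockerfile hunks above add `libcurl4-openssl-dev` / `libcurl-devel`. A minimal sketch of the two options for a local build, assuming a Debian/Ubuntu host:

```sh
# libcurl headers are needed once LLAMA_CURL defaults to ON (Debian/Ubuntu package name)
sudo apt-get install -y libcurl4-openssl-dev
cmake -B build -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release -j $(nproc)

# or opt out of the libcurl-based model downloading entirely
cmake -B build -DLLAMA_CURL=OFF
```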
@ -145,7 +147,13 @@ endif()
# 3rd-party
#
if (NOT TARGET ggml)
if (LLAMA_USE_SYSTEM_GGML)
message(STATUS "Using system-provided libggml, skipping ggml build")
find_package(ggml REQUIRED)
add_library(ggml ALIAS ggml::ggml)
endif()
if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
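For illustration, a hedged sketch of how the new `LLAMA_USE_SYSTEM_GGML` option could be used: when it is enabled, the build calls `find_package(ggml REQUIRED)` instead of descending into the bundled `ggml/` subdirectory, so a system-wide ggml install must already be discoverable by CMake. The install prefix below is only an example.

```sh
# build against a system-provided libggml instead of the vendored ggml/ tree
# (assumes ggml was previously installed under /usr/local; adjust the prefix as needed)
cmake -B build \
    -DLLAMA_USE_SYSTEM_GGML=ON \
    -DCMAKE_PREFIX_PATH=/usr/local
cmake --build build --config Release -j $(nproc)
```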
@ -160,6 +168,11 @@ add_subdirectory(src)
# utils, programs, examples and tests
#
if (NOT LLAMA_BUILD_COMMON)
message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
set(LLAMA_CURL OFF)
endif()
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()
@ -234,3 +247,20 @@ configure_file(cmake/llama.pc.in
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
#
# copy the license files
#
# Check if running in GitHub Actions
if(DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
message(STATUS "Running inside GitHub Actions - copying license files")
# Copy all files from licenses/ to build/bin/
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
foreach(LICENSE_FILE ${LICENSE_FILES})
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
configure_file(${LICENSE_FILE} "${CMAKE_BINARY_DIR}/bin/${FILENAME}" COPYONLY)
endforeach()
endif()


@ -39,7 +39,7 @@
_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_
- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` (from clang-tools v15+) to format the added code
- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggml-org/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$


@ -780,10 +780,6 @@ ifdef GGML_HIP
MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
ifdef GGML_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA
endif # GGML_HIP_UMA
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
@ -836,7 +832,7 @@ ifdef GGML_MUSA
else
MUSA_PATH ?= /opt/musa
endif
MUSA_ARCHITECTURES ?= 21;22
MUSA_ARCHITECTURES ?= 21;22;31
MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib


@ -1,19 +0,0 @@
// swift-tools-version:5.5
import PackageDescription
let package = Package(
name: "llama",
platforms: [
.macOS(.v12),
.iOS(.v14),
.watchOS(.v4),
.tvOS(.v14)
],
products: [
.library(name: "llama", targets: ["llama"]),
],
targets: [
.systemLibrary(name: "llama", pkgConfig: "llama"),
]
)


@ -5,17 +5,10 @@
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
[Roadmap](https://github.com/users/ggml-org/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
> [!IMPORTANT]
> New `llama.cpp` package location: [ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp/pkgs/container/llama.cpp)
>
> Update your container URLs to: `ghcr.io/ggml-org/llama.cpp`
>
> More info: https://github.com/ggml-org/llama.cpp/discussions/11801
## Recent API changes
- [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)
@ -23,9 +16,10 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
## Hot topics
- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
- Universal tool call support in `llama-server`: https://github.com/ggml-org/llama.cpp/pull/9639
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)); `libllava` will be deprecated
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
@ -104,6 +98,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat)
- [x] [GLM-4-0414](https://huggingface.co/collections/THUDM/glm-4-0414-67f3cbcb34dd9d252707cb2e)
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
@ -112,6 +107,8 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
#### Multimodal
@ -157,6 +154,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
- Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi)
</details>
@ -171,6 +169,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [johnbean393/Sidekick](https://github.com/johnbean393/Sidekick) (MIT)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
@ -243,6 +242,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU |
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) | All |
## Building the project
@ -261,7 +261,9 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from Hugging Face by using this CLI argument: `-hf <user>/<model>[:quant]`
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible model from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
By default, the CLI downloads from Hugging Face; you can switch to another host with the `MODEL_ENDPOINT` environment variable. For example, to fetch model checkpoints from ModelScope, set `MODEL_ENDPOINT=https://www.modelscope.cn/`.
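For example, a minimal sketch of both paths (the model name below is only an illustration; substitute any GGUF repository you actually want to use):
```bash
# default: resolve and download the model from Hugging Face
llama-cli -hf ggml-org/gemma-3-1b-it-GGUF

# same command, but resolving the model through ModelScope instead
MODEL_ENDPOINT=https://www.modelscope.cn/ llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
```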
After downloading a model, use the CLI tools to run it locally - see below.
@ -526,6 +528,35 @@ If your issue is with model generation quality, then please at least scan the fo
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
## XCFramework
The XCFramework is a precompiled version of the library for iOS, visionOS, tvOS,
and macOS. It can be used in Swift projects without the need to compile the
library from source. For example:
```swift
// swift-tools-version: 5.10
// The swift-tools-version declares the minimum version of Swift required to build this package.
import PackageDescription
let package = Package(
name: "MyLlamaPackage",
targets: [
.executableTarget(
name: "MyLlamaPackage",
dependencies: [
"LlamaFramework"
]),
.binaryTarget(
name: "LlamaFramework",
url: "https://github.com/ggml-org/llama.cpp/releases/download/b5046/llama-b5046-xcframework.zip",
checksum: "c19be78b5f00d8d29a25da41042cb7afa094cbf6280a225abe614b03b20029ab"
)
]
)
```
The above example uses an intermediate build `b5046` of the library. To use a different version, change the URL and checksum accordingly.
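If you point the package at a different release, one way to obtain the matching checksum is to download that archive and let SwiftPM compute it. A sketch, reusing the `b5046` artifact already referenced above:
```bash
# fetch a prebuilt XCFramework release and compute the checksum expected by Package.swift
curl -L -O https://github.com/ggml-org/llama.cpp/releases/download/b5046/llama-b5046-xcframework.zip
swift package compute-checksum llama-b5046-xcframework.zip
```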
## Completions
Command-line completion is available for some environments.
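For example, for Bash (a sketch that assumes the `--completion-bash` helper flag and a local CMake build; adjust the binary path to your setup):
```bash
# generate a completion script from the CLI itself and load it into the current shell
build/bin/llama-cli --completion-bash > ~/.llama-completion.bash
source ~/.llama-completion.bash
```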

View File

@ -40,7 +40,8 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
### Untrusted environments or networks
If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value
* Do not use the RPC backend ([rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc)) or the related [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value (see the sketch after this list).
* Encrypt your data if sending it over the network.
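A minimal sketch of the hash check mentioned in the list above (the file name and digest are placeholders; compare against the value published by whoever distributes the artifact):
```bash
# verify a downloaded GGUF against a known-good SHA-256 before loading it
echo "<known-good-sha256>  model.gguf" | sha256sum --check -
```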
### Multi-Tenant environments

View File

@ -1,4 +0,0 @@
#pragma once
#include <llama.h>

View File

@ -1,5 +0,0 @@
module llama [system] {
header "llama.h"
link "llama"
export *
}

538
build-xcframework.sh Executable file
View File

@ -0,0 +1,538 @@
#!/bin/bash
#
# Options
IOS_MIN_OS_VERSION=16.4
MACOS_MIN_OS_VERSION=13.3
VISIONOS_MIN_OS_VERSION=1.0
TVOS_MIN_OS_VERSION=16.4
BUILD_SHARED_LIBS=OFF
LLAMA_BUILD_EXAMPLES=OFF
LLAMA_BUILD_TESTS=OFF
LLAMA_BUILD_SERVER=OFF
GGML_METAL=ON
GGML_METAL_EMBED_LIBRARY=ON
GGML_BLAS_DEFAULT=ON
GGML_METAL_USE_BF16=ON
GGML_OPENMP=OFF
COMMON_C_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g"
COMMON_CXX_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g"
# Common options for all builds
COMMON_CMAKE_ARGS=(
-DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED=NO
-DCMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY=""
-DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED=NO
-DCMAKE_XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT="dwarf-with-dsym"
-DCMAKE_XCODE_ATTRIBUTE_GCC_GENERATE_DEBUGGING_SYMBOLS=YES
-DCMAKE_XCODE_ATTRIBUTE_COPY_PHASE_STRIP=NO
-DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
-DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
-DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
-DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
-DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
-DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT}
-DGGML_METAL=${GGML_METAL}
-DGGML_METAL_USE_BF16=${GGML_METAL_USE_BF16}
-DGGML_NATIVE=OFF
-DGGML_OPENMP=${GGML_OPENMP}
)
XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
echo "Detected Xcode version: $XCODE_VERSION"
check_required_tool() {
local tool=$1
local install_message=$2
if ! command -v $tool &> /dev/null; then
echo "Error: $tool is required but not found."
echo "$install_message"
exit 1
fi
}
echo "Checking for required tools..."
check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)"
check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). Make sure Xcode CLT is installed (xcode-select --install)"
check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
set -e
## Clean up previous builds
rm -rf build-apple
rm -rf build-ios-sim
rm -rf build-ios-device
rm -rf build-macos
rm -rf build-visionos
rm -rf build-visionos-sim
rm -rf build-tvos-sim
rm -rf build-tvos-device
# Setup the xcframework build directory structure
setup_framework_structure() {
local build_dir=$1
local min_os_version=$2
local platform=$3 # "ios", "macos", "visionos", or "tvos"
local framework_name="llama"
echo "Creating ${platform}-style framework structure for ${build_dir}"
if [[ "$platform" == "macos" ]]; then
# macOS versioned structure uses versioned directories
mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Headers
mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Modules
mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Resources
# Create symbolic links
ln -sf A ${build_dir}/framework/${framework_name}.framework/Versions/Current
ln -sf Versions/Current/Headers ${build_dir}/framework/${framework_name}.framework/Headers
ln -sf Versions/Current/Modules ${build_dir}/framework/${framework_name}.framework/Modules
ln -sf Versions/Current/Resources ${build_dir}/framework/${framework_name}.framework/Resources
ln -sf Versions/Current/${framework_name} ${build_dir}/framework/${framework_name}.framework/${framework_name}
# Set header and module paths
local header_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Headers/
local module_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Modules/
else
# iOS/VisionOS/tvOS use a flat structure
mkdir -p ${build_dir}/framework/${framework_name}.framework/Headers
mkdir -p ${build_dir}/framework/${framework_name}.framework/Modules
# Remove any existing structure to ensure clean build
rm -rf ${build_dir}/framework/${framework_name}.framework/Versions
# Set header and module paths
local header_path=${build_dir}/framework/${framework_name}.framework/Headers/
local module_path=${build_dir}/framework/${framework_name}.framework/Modules/
fi
# Copy all required headers (common for all platforms)
cp include/llama.h ${header_path}
cp ggml/include/ggml.h ${header_path}
cp ggml/include/ggml-alloc.h ${header_path}
cp ggml/include/ggml-backend.h ${header_path}
cp ggml/include/ggml-metal.h ${header_path}
cp ggml/include/ggml-cpu.h ${header_path}
cp ggml/include/ggml-blas.h ${header_path}
cp ggml/include/gguf.h ${header_path}
# Create module map (common for all platforms)
cat > ${module_path}module.modulemap << EOF
framework module llama {
header "llama.h"
header "ggml.h"
header "ggml-alloc.h"
header "ggml-backend.h"
header "ggml-metal.h"
header "ggml-cpu.h"
header "ggml-blas.h"
header "gguf.h"
link "c++"
link framework "Accelerate"
link framework "Metal"
link framework "Foundation"
export *
}
EOF
# Platform-specific settings for Info.plist
local platform_name=""
local sdk_name=""
local supported_platform=""
case "$platform" in
"ios")
platform_name="iphoneos"
sdk_name="iphoneos${min_os_version}"
supported_platform="iPhoneOS"
local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
local device_family=' <key>UIDeviceFamily</key>
<array>
<integer>1</integer>
<integer>2</integer>
</array>'
;;
"macos")
platform_name="macosx"
sdk_name="macosx${min_os_version}"
supported_platform="MacOSX"
local plist_path="${build_dir}/framework/${framework_name}.framework/Versions/A/Resources/Info.plist"
local device_family=""
;;
"visionos")
platform_name="xros"
sdk_name="xros${min_os_version}"
supported_platform="XRPlatform"
local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
local device_family=""
;;
"tvos")
platform_name="appletvos"
sdk_name="appletvos${min_os_version}"
supported_platform="AppleTVOS"
local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
local device_family=' <key>UIDeviceFamily</key>
<array>
<integer>3</integer>
</array>'
;;
esac
# Create Info.plist
cat > ${plist_path} << EOF
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleDevelopmentRegion</key>
<string>en</string>
<key>CFBundleExecutable</key>
<string>llama</string>
<key>CFBundleIdentifier</key>
<string>org.ggml.llama</string>
<key>CFBundleInfoDictionaryVersion</key>
<string>6.0</string>
<key>CFBundleName</key>
<string>llama</string>
<key>CFBundlePackageType</key>
<string>FMWK</string>
<key>CFBundleShortVersionString</key>
<string>1.0</string>
<key>CFBundleVersion</key>
<string>1</string>
<key>MinimumOSVersion</key>
<string>${min_os_version}</string>
<key>CFBundleSupportedPlatforms</key>
<array>
<string>${supported_platform}</string>
</array>${device_family}
<key>DTPlatformName</key>
<string>${platform_name}</string>
<key>DTSDKName</key>
<string>${sdk_name}</string>
</dict>
</plist>
EOF
}
# Create dynamic libraries from static libraries.
combine_static_libraries() {
local build_dir="$1"
local release_dir="$2"
local platform="$3" # "ios", "macos", "visionos", or "tvos"
local is_simulator="$4"
local base_dir="$(pwd)"
local framework_name="llama"
# Determine output path based on platform
local output_lib=""
if [[ "$platform" == "macos" ]]; then
# macOS uses versioned structure
output_lib="${build_dir}/framework/${framework_name}.framework/Versions/A/${framework_name}"
else
# iOS, visionOS, and tvOS use a flat directory structure
output_lib="${build_dir}/framework/${framework_name}.framework/${framework_name}"
fi
local libs=(
"${base_dir}/${build_dir}/src/${release_dir}/libllama.a"
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml.a"
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-base.a"
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
"${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
"${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
)
# Create temporary directory for processing
local temp_dir="${base_dir}/${build_dir}/temp"
mkdir -p "${temp_dir}"
# Since we have multiple architectures, libtool will find object files that do not
# match the target architecture. We suppress these warnings.
libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null
# Determine SDK, architectures, and install_name based on platform and simulator flag.
local sdk=""
local archs=""
local min_version_flag=""
local install_name=""
case "$platform" in
"ios")
if [[ "$is_simulator" == "true" ]]; then
sdk="iphonesimulator"
archs="arm64 x86_64"
min_version_flag="-mios-simulator-version-min=${IOS_MIN_OS_VERSION}"
else
sdk="iphoneos"
archs="arm64"
min_version_flag="-mios-version-min=${IOS_MIN_OS_VERSION}"
fi
install_name="@rpath/llama.framework/llama"
;;
"macos")
sdk="macosx"
archs="arm64 x86_64"
min_version_flag="-mmacosx-version-min=${MACOS_MIN_OS_VERSION}"
install_name="@rpath/llama.framework/Versions/Current/llama"
;;
"visionos")
if [[ "$is_simulator" == "true" ]]; then
sdk="xrsimulator"
archs="arm64 x86_64"
min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}-simulator"
else
sdk="xros"
archs="arm64"
min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}"
fi
# Use flat structure for visionOS, same as iOS
install_name="@rpath/llama.framework/llama"
;;
"tvos")
if [[ "$is_simulator" == "true" ]]; then
sdk="appletvsimulator"
archs="arm64 x86_64"
min_version_flag="-mtvos-simulator-version-min=${TVOS_MIN_OS_VERSION}"
else
sdk="appletvos"
archs="arm64"
min_version_flag="-mtvos-version-min=${TVOS_MIN_OS_VERSION}"
fi
install_name="@rpath/llama.framework/llama"
;;
esac
# Build architecture flags
local arch_flags=""
for arch in $archs; do
arch_flags+=" -arch $arch"
done
# Create dynamic library
echo "Creating dynamic library for ${platform}."
xcrun -sdk $sdk clang++ -dynamiclib \
-isysroot $(xcrun --sdk $sdk --show-sdk-path) \
$arch_flags \
$min_version_flag \
-Wl,-force_load,"${temp_dir}/combined.a" \
-framework Foundation -framework Metal -framework Accelerate \
-install_name "$install_name" \
-o "${base_dir}/${output_lib}"
# Platform-specific post-processing for device builds
if [[ "$is_simulator" == "false" ]]; then
if command -v xcrun vtool &>/dev/null; then
case "$platform" in
"ios")
echo "Marking binary as a framework binary for iOS..."
xcrun vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
;;
"visionos")
echo "Marking binary as a framework binary for visionOS..."
if [[ "$MAJOR_VERSION" -gt 16 ]] || [[ "$MAJOR_VERSION" -eq 16 && "$MINOR_VERSION" -gt 2 ]]; then
echo "Xcode version greater than 16.2, using visionOS."
VISION_OS_BUILD_VERSION="visionos"
else
echo "Xcode version less than or equal to 16.2, using xros."
VISION_OS_BUILD_VERSION="xros"
fi
xcrun vtool -set-build-version ${VISION_OS_BUILD_VERSION} ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
;;
"tvos")
echo "Marking binary as a framework binary for tvOS..."
xcrun vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
;;
esac
else
echo "Warning: vtool not found. Binary may not pass App Store validation."
fi
fi
echo "Creating properly formatted dSYM..."
# Create a separate directory for dSYMs for all platforms
mkdir -p "${base_dir}/${build_dir}/dSYMs"
# iOS/visionOS/tvOS style dSYM (flat structure)
if [[ "$platform" == "ios" || "$platform" == "visionos" || "$platform" == "tvos" ]]; then
# Generate dSYM in the dSYMs directory
xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM"
# Create a copy of the binary that will be stripped
cp "${base_dir}/${output_lib}" "${temp_dir}/binary_to_strip"
# Strip debug symbols from the copy
xcrun strip -S "${temp_dir}/binary_to_strip" -o "${temp_dir}/stripped_lib"
# Replace the original with the stripped version
mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}"
else
# macOS style dSYM
# First strip debug info to a separate file
xcrun strip -S "${base_dir}/${output_lib}" -o "${temp_dir}/stripped_lib"
# Generate dSYM in the dSYMs directory
xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM"
# Replace original binary with stripped version
mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}"
fi
# Remove any automatically generated dSYM files in the framework structure as they will
# otherwise cause Invalid Bundle Structure validation errors.
if [ -d "${base_dir}/${output_lib}.dSYM" ]; then
echo "Removing generated dSYM file in framework structure: ${base_dir}/${output_lib}.dSYM"
rm -rf "${base_dir}/${output_lib}.dSYM"
fi
# Clean up
rm -rf "${temp_dir}"
}
echo "Building for iOS simulator..."
cmake -B build-ios-sim -G Xcode \
"${COMMON_CMAKE_ARGS[@]}" \
-DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
-DIOS=ON \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_SYSROOT=iphonesimulator \
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-ios-sim --config Release -- -quiet
echo "Building for iOS devices..."
cmake -B build-ios-device -G Xcode \
"${COMMON_CMAKE_ARGS[@]}" \
-DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
-DCMAKE_OSX_SYSROOT=iphoneos \
-DCMAKE_OSX_ARCHITECTURES="arm64" \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-ios-device --config Release -- -quiet
echo "Building for macOS..."
cmake -B build-macos -G Xcode \
"${COMMON_CMAKE_ARGS[@]}" \
-DCMAKE_OSX_DEPLOYMENT_TARGET=${MACOS_MIN_OS_VERSION} \
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-macos --config Release -- -quiet
echo "Building for visionOS..."
cmake -B build-visionos -G Xcode \
"${COMMON_CMAKE_ARGS[@]}" \
-DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \
-DCMAKE_OSX_ARCHITECTURES="arm64" \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_SYSROOT=xros \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-visionos --config Release -- -quiet
echo "Building for visionOS simulator..."
cmake -B build-visionos-sim -G Xcode \
"${COMMON_CMAKE_ARGS[@]}" \
-DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_SYSROOT=xrsimulator \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-visionos-sim --config Release -- -quiet
# Add tvOS builds (might need the same u_int definitions as watchOS and visionOS)
echo "Building for tvOS simulator..."
cmake -B build-tvos-sim -G Xcode \
"${COMMON_CMAKE_ARGS[@]}" \
-DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \
-DCMAKE_SYSTEM_NAME=tvOS \
-DCMAKE_OSX_SYSROOT=appletvsimulator \
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
-DGGML_METAL=ON \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-tvos-sim --config Release -- -quiet
echo "Building for tvOS devices..."
cmake -B build-tvos-device -G Xcode \
"${COMMON_CMAKE_ARGS[@]}" \
-DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \
-DCMAKE_SYSTEM_NAME=tvOS \
-DCMAKE_OSX_SYSROOT=appletvos \
-DCMAKE_OSX_ARCHITECTURES="arm64" \
-DGGML_METAL=ON \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-tvos-device --config Release -- -quiet
# Setup frameworks and copy binaries and headers
echo "Setting up framework structures..."
setup_framework_structure "build-ios-sim" ${IOS_MIN_OS_VERSION} "ios"
setup_framework_structure "build-ios-device" ${IOS_MIN_OS_VERSION} "ios"
setup_framework_structure "build-macos" ${MACOS_MIN_OS_VERSION} "macos"
setup_framework_structure "build-visionos" ${VISIONOS_MIN_OS_VERSION} "visionos"
setup_framework_structure "build-visionos-sim" ${VISIONOS_MIN_OS_VERSION} "visionos"
setup_framework_structure "build-tvos-sim" ${TVOS_MIN_OS_VERSION} "tvos"
setup_framework_structure "build-tvos-device" ${TVOS_MIN_OS_VERSION} "tvos"
# Create dynamic libraries from static libraries
echo "Creating dynamic libraries from static libraries..."
combine_static_libraries "build-ios-sim" "Release-iphonesimulator" "ios" "true"
combine_static_libraries "build-ios-device" "Release-iphoneos" "ios" "false"
combine_static_libraries "build-macos" "Release" "macos" "false"
combine_static_libraries "build-visionos" "Release-xros" "visionos" "false"
combine_static_libraries "build-visionos-sim" "Release-xrsimulator" "visionos" "true"
combine_static_libraries "build-tvos-sim" "Release-appletvsimulator" "tvos" "true"
combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false"
# Create XCFramework with correct debug symbols paths
echo "Creating XCFramework..."
xcodebuild -create-xcframework \
-framework $(pwd)/build-ios-sim/framework/llama.framework \
-debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \
-framework $(pwd)/build-ios-device/framework/llama.framework \
-debug-symbols $(pwd)/build-ios-device/dSYMs/llama.dSYM \
-framework $(pwd)/build-macos/framework/llama.framework \
-debug-symbols $(pwd)/build-macos/dSYMs/llama.dSYM \
-framework $(pwd)/build-visionos/framework/llama.framework \
-debug-symbols $(pwd)/build-visionos/dSYMs/llama.dSYM \
-framework $(pwd)/build-visionos-sim/framework/llama.framework \
-debug-symbols $(pwd)/build-visionos-sim/dSYMs/llama.dSYM \
-framework $(pwd)/build-tvos-device/framework/llama.framework \
-debug-symbols $(pwd)/build-tvos-device/dSYMs/llama.dSYM \
-framework $(pwd)/build-tvos-sim/framework/llama.framework \
-debug-symbols $(pwd)/build-tvos-sim/dSYMs/llama.dSYM \
-output $(pwd)/build-apple/llama.xcframework

View File

@ -26,4 +26,43 @@ GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with SYCL support
source /opt/intel/oneapi/setvars.sh
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with MUSA support
GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```
## Running MUSA CI in a Docker Container
Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
### 1. Create a local directory to store cached models, configuration files and venv:
```bash
mkdir -p $HOME/llama.cpp/ci-cache
```
### 2. Create a local directory to store CI run results:
```bash
mkdir -p $HOME/llama.cpp/ci-results
```
### 3. Start a Docker container and run the CI:
```bash
docker run --privileged -it \
-v $HOME/llama.cpp/ci-cache:/ci-cache \
-v $HOME/llama.cpp/ci-results:/ci-results \
-v $PWD:/ws -w /ws \
mthreads/musa:rc3.1.1-devel-ubuntu22.04
```
Inside the container, execute the following commands:
```bash
apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
git config --global --add safe.directory /ws
GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
```
This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.

View File

@ -16,6 +16,9 @@
# # with VULKAN support
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with MUSA support
# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
@ -36,7 +39,7 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
@ -52,13 +55,24 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
echo "source /opt/intel/oneapi/setvars.sh"
exit 1
fi
# Use only main GPU
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
# Enable sysman for correct memory reporting
export ZES_ENABLE_SYSMAN=1
# to circumvent precision issues on CPY operations
export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
fi
if [ ! -z ${GG_BUILD_VULKAN} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
fi
if [ ! -z ${GG_BUILD_MUSA} ]; then
# Use qy1 by default (MTT S80)
MUSA_ARCH=${MUSA_ARCH:-21}
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
fi
## helpers
# download a file if it does not exist or if it is outdated
@ -352,10 +366,10 @@ function gg_run_open_llama_7b_v2 {
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
@ -808,7 +822,7 @@ export LLAMA_LOG_PREFIX=1
export LLAMA_LOG_TIMESTAMPS=1
if [ -z ${GG_BUILD_LOW_PERF} ]; then
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models
rm -rf ${SRC}/models-mnt
mnt_models=${MNT}/models
mkdir -p ${mnt_models}
@ -826,8 +840,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
fi
ret=0
test $ret -eq 0 && gg_run ctest_debug
if [ -z ${GG_BUILD_SYCL} ]; then
# SYCL build breaks with debug build flags
test $ret -eq 0 && gg_run ctest_debug
fi
test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then
@ -835,7 +851,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run rerank_tiny
if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
test $ret -eq 0 && gg_run test_scripts_debug
if [ -z ${GG_BUILD_SYCL} ]; then
test $ret -eq 0 && gg_run test_scripts_debug
fi
test $ret -eq 0 && gg_run test_scripts_release
fi
@ -846,7 +864,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run pythia_2_8b
#test $ret -eq 0 && gg_run open_llama_7b_v2
fi
test $ret -eq 0 && gg_run ctest_with_model_debug
if [ -z ${GG_BUILD_SYCL} ]; then
test $ret -eq 0 && gg_run ctest_with_model_debug
fi
test $ret -eq 0 && gg_run ctest_with_model_release
fi
fi

View File

@ -1,3 +1,5 @@
include("ggml/cmake/common.cmake")
function(llama_add_compile_flags)
if (LLAMA_FATAL_WARNINGS)
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")

View File

@ -85,7 +85,10 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
# Use curl to download model url
if (LLAMA_CURL)
find_package(CURL REQUIRED)
find_package(CURL)
if (NOT CURL_FOUND)
message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
endif()
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED)
@ -114,8 +117,8 @@ if (LLAMA_LLGUIDANCE)
ExternalProject_Add(llguidance_ext
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
# v0.6.12:
GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09
# v0.7.10:
GIT_TAG 0309d2a6bf40abda35344a362edc71e06d5009f8
PREFIX ${CMAKE_BINARY_DIR}/llguidance
SOURCE_DIR ${LLGUIDANCE_SRC}
BUILD_IN_SOURCE TRUE

File diff suppressed because it is too large

View File

@ -78,3 +78,12 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
// function to be used by test-arg-parser
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
bool common_has_curl();
struct common_remote_params {
std::vector<std::string> headers;
long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
};
// get remote file content, returns <http_code, raw_response_body>
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);

View File

@ -60,7 +60,9 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
}
msg.role = message.at("role");
if (message.contains("content")) {
auto has_content = message.contains("content");
auto has_tool_calls = message.contains("tool_calls");
if (has_content) {
const auto & content = message.at("content");
if (content.is_string()) {
msg.content = content;
@ -81,19 +83,8 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
} else if (!content.is_null()) {
throw std::runtime_error("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
}
} else {
throw std::runtime_error("Expected 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
}
if (message.contains("reasoning_content")) {
msg.reasoning_content = message.at("reasoning_content");
}
if (message.contains("name")) {
msg.tool_name = message.at("name");
}
if (message.contains("tool_call_id")) {
msg.tool_call_id = message.at("tool_call_id");
}
if (message.contains("tool_calls")) {
if (has_tool_calls) {
for (const auto & tool_call : message.at("tool_calls")) {
common_chat_tool_call tc;
if (!tool_call.contains("type")) {
@ -118,6 +109,18 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
msg.tool_calls.push_back(tc);
}
}
if (!has_content && !has_tool_calls) {
throw std::runtime_error("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)");
}
if (message.contains("reasoning_content")) {
msg.reasoning_content = message.at("reasoning_content");
}
if (message.contains("name")) {
msg.tool_name = message.at("name");
}
if (message.contains("tool_call_id")) {
msg.tool_call_id = message.at("tool_call_id");
}
msgs.push_back(msg);
}
@ -442,6 +445,7 @@ std::string common_chat_format_name(common_chat_format format) {
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING: return "Hermes 2 Pro (extract reasoning)";
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: return "Command R7B (extract reasoning)";
default:
@ -449,12 +453,6 @@ std::string common_chat_format_name(common_chat_format format) {
}
}
const common_grammar_options grammar_options {
/* .dotall = */ false,
/* .compact_spaces = */ false,
// /* .compact_spaces = */ true,
};
static bool parse_json(std::string::const_iterator & it, const std::string::const_iterator & end, json & out) {
// // https://json.nlohmann.me/features/parsing/sax_interface/
struct json_error_locator : public nlohmann::json_sax<json> {
@ -500,6 +498,34 @@ static bool parse_json(std::string::const_iterator & it, const std::string::cons
}
}
static bool parse_literal(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) {
auto expected_it = expected.begin();
auto tmp_it = it;
while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) {
++tmp_it;
++expected_it;
}
if (expected_it == expected.end()) {
it = tmp_it;
return true;
}
return false;
}
static std::optional<std::smatch> parse_pattern(std::string::const_iterator & it, const std::string::const_iterator & end, const std::regex & expected) {
std::smatch match;
if (std::regex_match(it, end, match, expected)) {
it = match.suffix().first;
return match;
}
return std::nullopt;
}
static void consume_spaces(std::string::const_iterator & it, const std::string::const_iterator & end) {
while (it != end && std::isspace(*it)) {
++it;
}
}
/**
* Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between.
@ -509,7 +535,8 @@ static common_chat_msg parse_json_tool_calls(
const std::string& input,
const std::optional<std::regex> & trigger_opt,
const std::regex & function_regex,
const std::regex & close_regex) {
const std::regex & close_regex,
bool allow_raw_python = false) {
std::smatch match;
common_chat_msg result;
@ -540,14 +567,19 @@ static common_chat_msg parse_json_tool_calls(
it = rit->suffix().first;
json arguments;
if (!parse_json(it, end, arguments)) {
if (parse_json(it, end, arguments)) {
if (!std::regex_search(it, end, match, close_regex)) {
throw std::runtime_error("Malformed input, missing closing pattern: " + input);
}
it = match.suffix().first;
result.tool_calls.push_back({name, arguments.is_string() ? arguments.get<std::string>() : arguments.dump(), /* id= */ ""});
} else {
if (allow_raw_python && name == "python") {
result.tool_calls.push_back({name, json({{"code", std::string(it, end)}}).dump(), /* id= */ ""});
break;
}
throw std::runtime_error("Failed to parse json tool call arguments: " + input);
}
if (!std::regex_search(it, end, match, close_regex)) {
throw std::runtime_error("Malformed input, missing closing pattern: " + input);
}
it = match.suffix().first;
result.tool_calls.push_back({name, arguments.is_string() ? arguments.get<std::string>() : arguments.dump(), /* id= */ ""});
}
if (!result.tool_calls.empty()) {
@ -559,29 +591,29 @@ static common_chat_msg parse_json_tool_calls(
return result;
}
static common_chat_tool_call process_tool_call(const json & tool_call) {
const auto & arguments = tool_call.at("arguments");
return {
/* .name = */ tool_call.at("name"),
/* .arguments = */ arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
/* .id = */ tool_call.contains("id") ? tool_call.at("id") : "",
};
}
static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& input, const std::string & prefix, size_t rstrip_prefix = 0) {
auto content_end = input.find(prefix);
size_t tc_start = std::string::npos;
common_chat_msg result;
result.role = "assistant";
const auto process_tool_calls = [&](const json & tool_calls) {
for (const auto & tool_call : tool_calls) {
const auto & arguments = tool_call.at("arguments");
result.tool_calls.push_back({
tool_call.at("name"),
arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
tool_call.contains("id") ? tool_call.at("id") : "",
});
}
};
if (content_end == std::string::npos) {
result.content = input;
} else {
tc_start = content_end + prefix.size() - rstrip_prefix;
result.content = input.substr(0, content_end);
auto tool_calls = json::parse(input.substr(tc_start));
process_tool_calls(tool_calls);
for (const auto & tool_call : tool_calls) {
result.tool_calls.emplace_back(process_tool_call(tool_call));
}
}
return result;
}
@ -700,7 +732,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
data.grammar_lazy = false;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
builder.add_schema("root", schema);
}, grammar_options);
});
auto tweaked_messages = common_chat_template::add_system(
inputs.messages,
@ -770,8 +802,11 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
schema["maxItems"] = 1;
}
builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
}, grammar_options);
data.grammar_triggers.push_back({"[TOOL_CALLS]", /* .at_start = */ true});
});
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
data.preserved_tokens = {
"[TOOL_CALLS]",
};
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
return data;
@ -813,14 +848,18 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
schema["maxItems"] = 1;
}
builder.add_rule("root", "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\"");
}, grammar_options);
data.grammar_triggers.push_back({"<|START_ACTION|>", /* .at_start = */ false});
});
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
"<|START_ACTION|>",
});
data.preserved_tokens = {
"<|START_ACTION|>",
"<|END_ACTION|>",
"<|START_RESPONSE|>",
"<|END_RESPONSE|>",
"<|START_THINKING|>",
"<|END_THINKING|>",
"<|END_ACTION|>",
};
auto adjusted_messages = json::array();
for (const auto & msg : inputs.messages) {
@ -840,9 +879,9 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
return data;
}
static common_chat_msg common_chat_parse_command_r7b(const std::string & input, bool extract_reasoning) {
static std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|>)([\\s\\S\\n\\r]*)");
static std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>");
static std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");
static const std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S]*?)<\\|END_THINKING\\|>)([\\s\\S]*)");
static const std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S]*?)<\\|END_ACTION\\|>");
static const std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S]*?)<\\|END_RESPONSE\\|>");
std::smatch match;
@ -945,23 +984,23 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com
builder.add_rule(
name + "-call",
"\"{\" space "
"( \"\\\"type\\\":\" space \"\\\"function\\\",\" space )? "
"\"\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " +
builder.add_schema(name + "-args", parameters) +
" \"}\""));
data.grammar_triggers.push_back({"{\"name\": \"" + name + "\"", /* .at_start = */ true});
"( \"\\\"type\\\"\" space \":\" space \"\\\"function\\\"\" space \",\" space )? "
" \"\\\"name\\\"\" space \":\" space \"\\\"" + name + "\\\"\" space \",\" space "
" \"\\\"parameters\\\"\" space \":\" space " + builder.add_schema(name + "-args", parameters) + " "
"\"}\" space"));
});
// Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name.
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
"\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"", // + name + "\"[\\s\\S]*",
});
data.grammar_triggers.push_back({"{\"name\":", /* .at_start = */ true});
data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true});
data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true});
data.grammar_triggers.push_back({"{\"type\": \"function\"", /* .at_start = */ true});
data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true});
data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true});
if (!builtin_tools.empty()) {
data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
data.preserved_tokens.push_back("<|python_tag|>");
}
// Allow a few empty lines on top of the usual constrained json schema space rule.
builder.add_rule("root", string_join(tool_rules, " | "));
}, grammar_options);
});
data.additional_stops.push_back("<|eom_id|>");
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
{"tools_in_user_message", false},
@ -974,33 +1013,33 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com
}
static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) {
// TODO: tighten & simplify the parser, don't accept leading text context.
static std::regex function_regex("\\{[\\s\\n\\r]*(?:\"type\"[\\s\\n\\r]*:[\\s\\n\\r]*\"function\"[\\s\\n\\r]*,[\\s\\n\\r]*|[\\s\\n\\r]*)\"name\"[\\s\\n\\r]*:[\\s\\n\\r]*\"([^\"]+)\"[\\s\\n\\r]*,[\\s\\n\\r]*\"parameters\": ");
static std::regex close_regex("\\}");
static std::regex builtin_call_regex("<\\|python_tag\\|>([^.(]+)\\.call\\((.*)\\)");
static const std::regex function_regex(
"\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: ");
static const std::regex close_regex("\\}\\s*");
static const std::regex builtin_call_regex("<\\|python_tag\\|>\\s*([^.(]+)\\s*\\.\\s*call\\s*\\(\\s*([\\w]+)\\s*=\\s*([\\s\\S]*?)\\)");
if (with_builtin_tools) {
std::smatch match;
if (std::regex_match(input, match, builtin_call_regex)) {
auto name = match[1].str();
auto raw_args = match[2].str();
try {
auto name = match[1].str();
auto arg_name = match[2].str();
auto arg_value_str = match[3].str();
auto arg_value = json::parse(arg_value_str);
// TODO: if/when builtin tools start accepting more than 1 argument, use parse_json for real parsing.
auto it_eq = raw_args.find('=');
auto arg_name = raw_args.substr(0, it_eq);
auto arg_value_str = raw_args.substr(it_eq + 1);
auto arg_value = json::parse(arg_value_str);
common_chat_msg msg;
msg.role = "assistant";
msg.content = match.prefix().str();
msg.tool_calls.push_back({
/* .name = */ name,
/* .arguments = */ (json {
{arg_name, arg_value},
}).dump(),
/* .id = */ "",
});
return msg;
common_chat_msg msg;
msg.role = "assistant";
msg.tool_calls.push_back({
/* .name = */ name,
/* .arguments = */ (json {
{arg_name, arg_value},
}).dump(),
/* .id = */ "",
});
return msg;
} catch (const std::exception & e) {
LOG_WRN("Failed to parse builtin tool call arguments (%s): %s", e.what(), input.c_str());
}
}
}
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
@ -1017,10 +1056,10 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
std::string name = function.at("name");
auto parameters = function.at("parameters");
builder.resolve_refs(parameters);
auto args_rule = builder.add_schema(name + "-args", parameters);
tool_rules.push_back(builder.add_rule(name + "-call",
"\"<tool▁call▁begin>function<tool▁sep>" + name + "\\n"
"```json\\n\" " + args_rule + " \"```<tool▁call▁end>\""));
"```json\\n\" " + builder.add_schema(name + "-args", parameters) + " "
"\"```<tool▁call▁end>\""));
});
// Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
// so we accept common variants (then it's all constrained)
@ -1029,18 +1068,20 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
"(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
"\"<tool▁calls▁end>\""
" space");
data.grammar_triggers.push_back({"<tool▁calls▁begin>", /* .at_start = */ false});
data.grammar_triggers.push_back({"<tool_calls_begin>", /* .at_start = */ false});
data.grammar_triggers.push_back({"<tool calls begin>", /* .at_start = */ false});
data.grammar_triggers.push_back({"<tool\\_calls\\_begin>", /* .at_start = */ false});
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<tool▁calls▁begin>"});
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<tool_calls_begin>"});
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<tool calls begin>"});
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<tool\\_calls\\_begin>"});
data.preserved_tokens = {
"<think>",
"</think>",
"<tool▁calls▁begin>",
"<tool▁call▁begin>",
"<tool▁sep>",
"<tool▁calls▁end",
"<tool▁call▁end>",
"<tool▁calls▁end",
};
}, grammar_options);
});
}
auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
@ -1065,34 +1106,42 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING : COMMON_CHAT_FORMAT_DEEPSEEK_R1;
return data;
}
static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) {
static std::regex function_regex("<tool▁call▁begin>function<tool▁sep>([^\n]+)\n```json\n");
static std::regex close_regex("```[\\s\\r\\n]*<tool▁call▁end>");
static std::regex reasoning_content_regex("((?:<think>)?([\\s\\S\\r\\n]*?)</think>)?([\\s\\S\\r\\n]*)");
static std::regex tool_calls_regex("[\\s\\r\\n]*(?:<tool▁calls▁begin>|<tool_calls_begin>|<tool calls begin>|<tool\\\\_calls\\\\_begin>)([\\s\\S\\r\\n]*?)<tool▁calls▁end>");
common_chat_msg msg;
msg.role = "assistant";
static common_chat_msg handle_think_tag_prelude(const std::string & input, bool extract_reasoning, const std::function<common_chat_msg(const std::string &)> & rest_parser) {
std::smatch match;
static const std::regex reasoning_content_regex("((?:<think>)?([\\s\\S\\r\\n]*?)</think>)?([\\s\\S\\r\\n]*)");
if (std::regex_match(input, match, reasoning_content_regex)) {
std::string rest;
auto rest = match[3].str();
auto msg = rest_parser(rest);
auto reasoning_content = string_strip(match[2].str());
if (extract_reasoning) {
msg.reasoning_content = string_strip(match[2].str());
} else {
msg.content = match[1].str();
msg.reasoning_content = reasoning_content;
} else if (!reasoning_content.empty()) {
std::ostringstream content;
content << "<think>" << reasoning_content << "</think>" << msg.content;
msg.content = content.str();
}
rest = match[3].str();
return msg;
}
return rest_parser(input);
}
static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) {
return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) {
static const std::regex function_regex("<tool▁call▁begin>function<tool▁sep>([^\n]+)\n```json\n");
static const std::regex close_regex("```[\\s\\r\\n]*<tool▁call▁end>");
static const std::regex tool_calls_regex("[\\s\\r\\n]*(?:<tool▁calls▁begin>|<tool_calls_begin>|<tool calls begin>|<tool\\\\_calls\\\\_begin>)([\\s\\S\\r\\n]*?)<tool▁calls▁end>");
if (std::regex_search(rest, match, tool_calls_regex)) {
common_chat_msg msg;
msg.role = "assistant";
std::smatch match;
if (std::regex_search(input, match, tool_calls_regex)) {
auto tool_calls = match[1].str();
auto msg2 = parse_json_tool_calls(tool_calls, std::nullopt, function_regex, close_regex);
msg.tool_calls = std::move(msg2.tool_calls);
} else {
msg.content += std::string(rest.begin() + rest.find_first_not_of(" \r\n"), rest.end());
msg.content = input;
}
} else {
msg.content = input;
}
return msg;
return msg;
});
}
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
@ -1129,8 +1178,11 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c
schema["maxItems"] = 1;
}
builder.add_rule("root", "\" functools\"? " + builder.add_schema("tool_calls", schema));
}, grammar_options);
data.grammar_triggers.push_back({" functools[", /* .at_start = */ false});
});
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, " functools["});
data.preserved_tokens = {
" functools[",
};
data.format = COMMON_CHAT_FORMAT_FIREFUNCTION_V2;
} else {
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
@ -1158,11 +1210,28 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
auto parameters = function.at("parameters");
builder.resolve_refs(parameters);
auto args_rule = builder.add_schema(name + "-args", parameters);
first_tool_rules.push_back(builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule));
first_tool_rules.push_back(builder.add_rule(name + "-call", "( \"assistant<|end_header_id|>\\n\" )? \"" + name + "\\n\" " + args_rule));
subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule));
data.grammar_triggers.push_back({name, /* .at_start = */ true});
data.grammar_triggers.push_back({">>>" + name, /* .at_start = */ false});
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
regex_escape(name + "\n"),
});
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
regex_escape("assistant<|end_header_id|>\n" + name + "\n"),
});
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
regex_escape(">>>" + name + "\n"),
});
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
">>>assistant<|end_header_id|>\n" + name,
});
});
data.preserved_tokens = {
"<|end_header_id|>",
};
auto first_rule = first_tool_rules.empty() ? "" : builder.add_rule("first_tool_call", string_join(first_tool_rules, " | ")) + " space";
if (inputs.parallel_tool_calls) {
auto subsequent_rule = builder.add_rule("subsequent_tool_call", string_join(subsequent_tool_rules, " | ")) + " space";
@ -1171,34 +1240,20 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
builder.add_rule("root", first_rule);
}
}, grammar_options);
});
}
return data;
}
static bool consume(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) {
auto expected_it = expected.begin();
auto tmp_it = it;
while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) {
++tmp_it;
++expected_it;
}
if (expected_it == expected.end()) {
it = tmp_it;
return true;
}
return false;
}
static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) {
static std::regex function_regex(R"((?:>>>)?(\w+)\n)");
static std::regex close_regex(R"($|(?=>>>))");
static const std::regex function_regex(R"((?:>>>)?(?:assistant<|end_header_id|>\n)?(\w+)\n)");
static const std::regex close_regex(R"($|(?=>>>))");
std::string content;
auto it = input.begin();
const auto end = input.end();
if (consume(it, end, "all\n")) {
if (parse_literal(it, end, "all\n")) {
std::smatch match;
if (std::regex_search(it, end, match, function_regex)) {
auto fun_it = match.prefix().second;
@ -1213,7 +1268,7 @@ static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & in
}
// TODO: tighten & simplify.
try {
auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex);
auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex, /* allow_raw_python= */ true);
res.content = content + res.content;
return res;
} catch (const std::exception & e) {
@ -1266,12 +1321,13 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
});
if (has_raw_python) {
tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
data.preserved_tokens.push_back("<|python_tag|>");
}
auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
data.grammar_triggers.push_back({"<function=", /* .at_start = */ false});
}, grammar_options);
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function="});
});
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
// TODO: if (has_raw_python)
@ -1280,7 +1336,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
}
static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) {
// This version of Functionary still supports the llama 3.1 tool call format for the python tool.
static std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
static const std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
std::smatch match;
if (std::regex_search(input, match, python_tag_regex)) {
auto code = match[1].str();
@ -1294,8 +1350,8 @@ static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::s
});
return msg;
}
static std::regex function_regex(R"(<function=(\w+)>)");
static std::regex close_regex(R"(</function>)");
static const std::regex function_regex(R"(<function=(\w+)>)");
static const std::regex close_regex(R"(</function>)");
// TODO: tighten & simplify.
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
}
@ -1306,6 +1362,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
std::vector<std::string> tool_rules;
std::vector<std::string> tool_call_alts;
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
std::string name = function.at("name");
@ -1319,68 +1376,187 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
}},
{"required", json::array({"name", "arguments"})},
}));
tool_call_alts.push_back(builder.add_rule(
name + "-function-tag",
"\"<function\" ( \"=" + name + "\" | \" name=\\\"" + name + "\\\"\" ) \">\" space " +
builder.add_schema(name + "-args", parameters) + " "
"\"</function>\" space"));
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
"<function=" + name + ">",
});
auto escaped_name = regex_escape(name);
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
"<function\\s+name\\s*=\\s*\"" + escaped_name + "\"",
});
});
auto tool_call = "\"<tool_call>\" space " + builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " \"</tool_call>\" space";
auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
std::vector<std::string> alt_tags {
any_tool_call,
"\"<tool_call>\" space " + any_tool_call + " \"</tool_call>\"",
// The rest is just to accommodate common "good bad" outputs.
"\"<function_call>\" space " + any_tool_call + " \"</function_call>\"",
"\"<response>\" space " + any_tool_call + " \"</response>\"",
"\"<tools>\" space " + any_tool_call + " \"</tools>\"",
"\"<json>\" space " + any_tool_call + " \"</json>\"",
"\"<xml>\" space " + any_tool_call + " \"</xml>\"",
"\"<JSON>\" space " + any_tool_call + " \"</JSON>\"",
};
auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space");
tool_call_alts.push_back(wrappable_tool_call);
tool_call_alts.push_back(
"( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space ");
auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | "));
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
data.grammar_triggers.push_back({"<tool_call>", /* .at_start = */ false});
data.preserved_tokens = { "</tool_call>" };
}, grammar_options);
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<tool_call>"});
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function"});
// Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
"(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?\\s*\\{\\s*\"", //name\"\\s*:\\s*\"" + escaped_name + "\"",
});
data.preserved_tokens = {
"<think>",
"</think>",
"<tool_call>",
"</tool_call>",
"<function",
"<tools>",
"</tools>",
"<response>",
"</response>",
"<function_call>",
"</function_call>",
"<json>",
"</json>",
"<JSON>",
"</JSON>",
"```",
"```json",
"```xml",
};
});
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING : COMMON_CHAT_FORMAT_HERMES_2_PRO;
return data;
}
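The PATTERN_START trigger added above is meant to catch tool calls that arrive without the expected <tool_call> wrapper (fenced JSON, stray <function_call>/<response> tags, etc.). A minimal standalone sketch of what it does and does not fire on, using the same pattern; the sample outputs are made up for illustration:

#include <iostream>
#include <regex>
#include <string>

int main() {
    // Same pattern as the COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START trigger above.
    const std::regex trigger(
        "(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?\\s*\\{\\s*\"");
    const std::string samples[] = {
        "```json\n{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}\n```",
        "<function_call> {\"name\": \"get_weather\", \"arguments\": {}}",
        "Sure, here is a plain text answer instead.",
    };
    for (const auto & s : samples) {
        // match_continuous anchors the match at the start of the output, like PATTERN_START.
        const bool hit = std::regex_search(s, trigger, std::regex_constants::match_continuous);
        std::cout << (hit ? "would trigger" : "no trigger") << ": " << s.substr(0, 30) << "\n";
    }
}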
static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input) {
try {
std::regex start_pattern(R"([\n\s]*<tool_call>)");
std::regex middle_pattern(R"([\n\s]*</tool_call>[\n\s]*<tool_call>)");
std::regex end_pattern(R"([\n\s]*</tool_call>[\n\s]*$)");
static common_chat_msg common_chat_parse_hermes_2_pro(const std::string& input, bool extract_reasoning) {
return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) {
static const std::regex open_regex(
"(?:"
"(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
"(<tool_call>" // match 2 (open_tag)
"|<function_call>"
"|<tool>"
"|<tools>"
"|<response>"
"|<json>"
"|<xml>"
"|<JSON>"
")?"
"(\\s*\\{\\s*\"name\"\\s*:[\\s\\S]*)" // match 3 (named tool call + rest)
")"
"|"
"(?:<function=([^>]+)>" // match 4 (function name)
"|<function name=\"([^\"]+)\">)" // match 5 (function name again)
"([\\s\\S]*)" // match 6 (function arguments + rest)})"
);
common_chat_msg msg;
msg.role = "assistant";
try {
common_chat_msg msg;
msg.role = "assistant";
auto end = input.end();
std::sregex_iterator rend;
std::sregex_iterator rit(input.begin(), end, start_pattern);
if (rit == rend) {
std::string::const_iterator it = input.begin();
const std::string::const_iterator end = input.end();
std::smatch match;
while (it != end) {
if (std::regex_search(it, end, match, open_regex)) {
// Add content before the match
msg.content += std::string(it, match[0].first);
auto block_start = match[1].str();
std::string block_end = block_start.empty() ? "" : "```";
auto open_tag = match[2].str();
std::string close_tag;
if (match[3].matched) {
close_tag = open_tag.empty() ? "" : "</" + open_tag.substr(1);
auto json_it = match[3].first;
json tool_call;
if (parse_json(json_it, end, tool_call) && tool_call.contains("name") && tool_call.contains("arguments")) {
msg.tool_calls.emplace_back(process_tool_call(tool_call));
it = json_it; // Move iterator past parsed JSON
// Handle close tags
consume_spaces(it, end);
if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
throw std::runtime_error("Failed to parse closing tag");
}
consume_spaces(it, end);
if (!block_end.empty() && !parse_literal(it, end, block_end)) {
throw std::runtime_error("Failed to parse block end");
}
consume_spaces(it, end);
} else {
// Not a valid tool call, treat as content
msg.content += std::string(match[0].first, match[0].second);
it = match[0].second;
}
} else {
auto function_name = match[4].str();
if (function_name.empty()) {
function_name = match[5].str();
}
GGML_ASSERT(!function_name.empty());
close_tag = "</function>";
// Start parsing from after the opening tags
auto json_it = match[6].first;
json arguments;
if (parse_json(json_it, end, arguments)) {
msg.tool_calls.emplace_back(process_tool_call({
{"name", function_name},
{"arguments", arguments},
}));
it = json_it; // Move iterator past parsed JSON
// Handle close tags
consume_spaces(it, end);
if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
throw std::runtime_error("Failed to parse closing tag");
}
consume_spaces(it, end);
if (!block_end.empty() && !parse_literal(it, end, block_end)) {
throw std::runtime_error("Failed to parse block end");
}
consume_spaces(it, end);
} else {
// Not a valid tool call, treat as content
msg.content += std::string(match[0].first, match[0].second);
it = match[0].second;
}
}
} else {
// Add remaining content
msg.content += std::string(it, end);
break;
}
}
return msg;
} catch (const std::exception & e) {
LOG_ERR("Failed to parse hermes 2 pro input: %s\n", e.what());
common_chat_msg msg;
msg.role = "assistant";
msg.content = input;
return msg;
}
msg.content = rit->prefix();
auto it = rit->suffix().first;
while (it != end) {
json call;
if (!parse_json(it, end, call)) {
throw std::runtime_error("Failed to parse json tool call");
}
const auto & arguments = call.at("arguments");
msg.tool_calls.push_back({
call.at("name"),
arguments.dump(),
// arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
/* id= */ "",
});
rit = {it, end, middle_pattern};
if (rit != rend) {
it = rit->suffix().first;
} else {
rit = {it, end, end_pattern};
if (rit == rend) {
throw std::runtime_error("Malformed input, missing </tool_call>");
}
break;
}
}
return msg;
} catch (const std::exception & e) {
LOG_ERR("Failed to parse hermes 2 pro input: %s\n", e.what());
common_chat_msg msg;
msg.role = "assistant";
msg.content = input;
return msg;
}
});
}
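For the common well-formed case, the parser above boils down to: find a <tool_call>...</tool_call> block, parse its JSON payload, and require "name" and "arguments". A rough standalone sketch of just that happy path; the helper name and sample call are hypothetical, and the real code additionally handles fenced and mis-tagged wrappers, the <function=...> syntax, and plain content around the calls:

#include "json.hpp" // nlohmann json, as used elsewhere in common/
#include <iostream>
#include <string>

using json = nlohmann::ordered_json;

// Hypothetical helper: extract a single well-formed tool call.
static bool extract_single_tool_call(const std::string & input, std::string & name, std::string & args) {
    const std::string open = "<tool_call>", close = "</tool_call>";
    const auto b = input.find(open);
    const auto e = input.find(close);
    if (b == std::string::npos || e == std::string::npos || e < b) {
        return false;
    }
    const auto payload = input.substr(b + open.size(), e - (b + open.size()));
    const json call = json::parse(payload);
    if (!call.contains("name") || !call.contains("arguments")) {
        return false;
    }
    name = call.at("name").get<std::string>();
    args = call.at("arguments").dump(); // arguments kept as a JSON string, as above
    return true;
}

int main() {
    std::string name, args;
    const std::string input = "<tool_call> {\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}} </tool_call>";
    if (extract_single_tool_call(input, name, args)) {
        std::cout << name << " <- " << args << "\n"; // get_weather <- {"city":"Paris"}
    }
}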
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
@ -1445,6 +1621,11 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_command_r7b(tmpl, params);
}
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null() && params.tools.is_array()) {
return common_chat_params_init_hermes_2_pro(tmpl, params);
}
// Use generic handler when mixing tools + JSON schema.
// TODO: support that mix in handlers below.
if ((params.tools.is_array() && params.json_schema.is_object())) {
@ -1466,11 +1647,6 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_without_tools(tmpl, params);
}
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
if (src.find("<tool_call>") != std::string::npos) {
return common_chat_params_init_hermes_2_pro(tmpl, params);
}
// Functionary v3.1 (w/ tools)
if (src.find("<|start_header_id|>") != std::string::npos
&& src.find("<function=") != std::string::npos) {
@ -1588,7 +1764,9 @@ common_chat_msg common_chat_parse(const std::string & input, common_chat_format
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1:
return common_chat_parse_functionary_v3_1_llama_3_1(input);
case COMMON_CHAT_FORMAT_HERMES_2_PRO:
return common_chat_parse_hermes_2_pro(input);
return common_chat_parse_hermes_2_pro(input, /* extract_reasoning= */ false);
case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING:
return common_chat_parse_hermes_2_pro(input, /* extract_reasoning= */ true);
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
return common_chat_parse_firefunction_v2(input);
case COMMON_CHAT_FORMAT_COMMAND_R7B:

View File

@ -53,6 +53,7 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
COMMON_CHAT_FORMAT_HERMES_2_PRO,
COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
COMMON_CHAT_FORMAT_COMMAND_R7B,
COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,

View File

@ -7,10 +7,6 @@
#include "common.h"
#include "log.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include <algorithm>
@ -52,47 +48,11 @@
#include <sys/stat.h>
#include <unistd.h>
#endif
#if defined(LLAMA_USE_CURL)
#include <curl/curl.h>
#include <curl/easy.h>
#include <future>
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#if defined(LLAMA_USE_CURL)
#ifdef __linux__
#include <linux/limits.h>
#elif defined(_WIN32)
# if !defined(PATH_MAX)
# define PATH_MAX MAX_PATH
# endif
#else
#include <sys/syslimits.h>
#endif
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
//
// CURL utils
//
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
struct curl_slist_ptr {
struct curl_slist * ptr = nullptr;
~curl_slist_ptr() {
if (ptr) {
curl_slist_free_all(ptr);
}
}
};
#endif // LLAMA_USE_CURL
using json = nlohmann::ordered_json;
//
// CPU utils
//
@ -483,6 +443,11 @@ void string_replace_all(std::string & s, const std::string & search, const std::
s = std::move(builder);
}
std::string regex_escape(const std::string & s) {
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
return std::regex_replace(s, special_chars, "\\$0");
}
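regex_escape() is what lets tool names be spliced into the <function name="..."> trigger patterns above without their metacharacters widening the match. A small self-contained check, reusing the definition above with a hypothetical tool name:

#include <iostream>
#include <regex>
#include <string>

// Copied from above.
static std::string regex_escape(const std::string & s) {
    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
    return std::regex_replace(s, special_chars, "\\$0");
}

int main() {
    const std::string name = "math.pow"; // hypothetical tool name containing a metacharacter
    const std::regex re("<function\\s+name\\s*=\\s*\"" + regex_escape(name) + "\"");
    std::cout << std::regex_search("<function name=\"math.pow\">", re) << "\n"; // 1
    std::cout << std::regex_search("<function name=\"mathXpow\">", re) << "\n"; // 0
}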
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
std::ostringstream result;
for (size_t i = 0; i < values.size(); ++i) {
@ -865,7 +830,7 @@ std::string fs_get_cache_directory() {
if (getenv("LLAMA_CACHE")) {
cache_directory = std::getenv("LLAMA_CACHE");
} else {
#ifdef __linux__
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
if (std::getenv("XDG_CACHE_HOME")) {
cache_directory = std::getenv("XDG_CACHE_HOME");
} else {
@ -875,7 +840,9 @@ std::string fs_get_cache_directory() {
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
#elif defined(_WIN32)
cache_directory = std::getenv("LOCALAPPDATA");
#endif // __linux__
#else
# error Unknown architecture
#endif
cache_directory = ensure_trailing_slash(cache_directory);
cache_directory += "llama.cpp";
}
@ -896,22 +863,14 @@ std::string fs_get_cache_file(const std::string & filename) {
//
// Model utils
//
struct common_init_result common_init_from_params(common_params & params) {
common_init_result iparams;
auto mparams = common_model_params_to_llama(params);
llama_model * model = nullptr;
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
} else if (!params.model_url.empty()) {
model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
} else {
model = llama_model_load_from_file(params.model.c_str(), mparams);
}
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
return iparams;
}
@ -946,13 +905,13 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
llama_model_free(model);
return iparams;
}
if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
params.ctx_shift = false;
}
@ -1029,6 +988,8 @@ struct common_init_result common_init_from_params(common_params & params) {
if (params.warmup) {
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
llama_set_warmup(lctx, true);
std::vector<llama_token> tmp;
llama_token bos = llama_vocab_bos(vocab);
llama_token eos = llama_vocab_eos(vocab);
@ -1056,9 +1017,10 @@ struct common_init_result common_init_from_params(common_params & params) {
if (llama_model_has_decoder(model)) {
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
}
llama_kv_cache_clear(lctx);
llama_kv_self_clear(lctx);
llama_synchronize(lctx);
llama_perf_context_reset(lctx);
llama_set_warmup(lctx, false);
}
iparams.model.reset(model);
@ -1067,6 +1029,19 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}
std::string get_model_endpoint() {
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
const char * hf_endpoint_env = getenv("HF_ENDPOINT");
const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
std::string model_endpoint = "https://huggingface.co/";
if (endpoint_env) {
model_endpoint = endpoint_env;
if (model_endpoint.back() != '/') model_endpoint += '/';
}
return model_endpoint;
}
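get_model_endpoint() prefers MODEL_ENDPOINT, falls back to HF_ENDPOINT, then to the Hugging Face default, and always ensures a trailing slash. A standalone walkthrough that mirrors the same logic; the mirror and localhost URLs are only examples, and setenv is POSIX:

#include <cstdlib>
#include <iostream>
#include <string>

static std::string resolve_endpoint() { // mirrors get_model_endpoint() above
    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
    const char * hf_endpoint_env    = getenv("HF_ENDPOINT");
    const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
    std::string endpoint = "https://huggingface.co/";
    if (endpoint_env) {
        endpoint = endpoint_env;
        if (endpoint.back() != '/') endpoint += '/';
    }
    return endpoint;
}

int main() {
    std::cout << resolve_endpoint() << "\n";            // https://huggingface.co/
    setenv("HF_ENDPOINT", "https://hf-mirror.com", 1);  // example mirror URL
    std::cout << resolve_endpoint() << "\n";            // https://hf-mirror.com/
    setenv("MODEL_ENDPOINT", "http://localhost:8080/", 1);
    std::cout << resolve_endpoint() << "\n";            // http://localhost:8080/
}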
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
llama_clear_adapter_lora(ctx);
for (auto & la : lora) {
@ -1082,15 +1057,18 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
if (!params.devices.empty()) {
mparams.devices = params.devices.data();
}
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;
} else {
@ -1098,6 +1076,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.kv_overrides = params.kv_overrides.data();
}
if (params.tensor_buft_overrides.empty()) {
mparams.tensor_buft_overrides = NULL;
} else {
GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
}
return mparams;
}
@ -1157,451 +1142,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
return tpp;
}
#ifdef LLAMA_USE_CURL
#define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts;
while (remaining_attempts > 0) {
LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
CURLcode res = curl_easy_perform(curl);
if (res == CURLE_OK) {
return true;
}
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
remaining_attempts--;
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
}
LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
return false;
}
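The retry delay is retry_delay_seconds raised to the number of failed attempts so far, in milliseconds: with the defaults above (3 attempts, 2 seconds) that is 1000 ms, 2000 ms, then 4000 ms. A quick check of the arithmetic:

#include <cmath>
#include <cstdio>

int main() {
    const int max_attempts = 3;         // CURL_MAX_RETRY
    const int retry_delay_seconds = 2;  // CURL_RETRY_DELAY_SECONDS
    for (int remaining = max_attempts; remaining > 0; remaining--) {
        const int delay_ms = (int) (std::pow(retry_delay_seconds, max_attempts - remaining) * 1000);
        std::printf("attempt %d fails -> sleep %d ms\n", max_attempts - remaining + 1, delay_ms);
    }
    // attempt 1 fails -> sleep 1000 ms
    // attempt 2 fails -> sleep 2000 ms
    // attempt 3 fails -> sleep 4000 ms
}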
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
if (!curl) {
LOG_ERR("%s: error initializing libcurl\n", __func__);
return false;
}
bool force_download = false;
// Set the URL, allow to follow http redirection
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
// Check if hf-token or bearer-token was specified
if (!hf_token.empty()) {
std::string auth_header = "Authorization: Bearer " + hf_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
}
#if defined(_WIN32)
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
// operating system. Currently implemented under MS-Windows.
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
#endif
// Check if the file already exists locally
auto file_exists = std::filesystem::exists(path);
// If the file exists, check its JSON metadata companion file.
std::string metadata_path = path + ".json";
nlohmann::json metadata;
std::string etag;
std::string last_modified;
if (file_exists) {
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
std::ifstream metadata_in(metadata_path);
if (metadata_in.good()) {
try {
metadata_in >> metadata;
LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
if (metadata.contains("url") && metadata.at("url").is_string()) {
auto previous_url = metadata.at("url").get<std::string>();
if (previous_url != url) {
LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
return false;
}
}
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
etag = metadata.at("etag");
}
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
last_modified = metadata.at("lastModified");
}
} catch (const nlohmann::json::exception & e) {
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
return false;
}
}
} else {
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
}
// Send a HEAD request to retrieve the etag and last-modified headers
struct common_load_model_from_url_headers {
std::string etag;
std::string last_modified;
};
common_load_model_from_url_headers headers;
{
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
static std::regex header_regex("([^:]+): (.*)\r\n");
static std::regex etag_regex("ETag", std::regex_constants::icase);
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
std::string header(buffer, n_items);
std::smatch match;
if (std::regex_match(header, match, header_regex)) {
const std::string & key = match[1];
const std::string & value = match[2];
if (std::regex_match(key, match, etag_regex)) {
headers->etag = value;
} else if (std::regex_match(key, match, last_modified_regex)) {
headers->last_modified = value;
}
}
return n_items;
};
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) {
return false;
}
long http_code = 0;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
if (http_code != 200) {
// HEAD not supported, we don't know if the file has changed
// force trigger downloading
force_download = true;
LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
}
}
bool should_download = !file_exists || force_download;
if (!should_download) {
if (!etag.empty() && etag != headers.etag) {
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
should_download = true;
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
should_download = true;
}
}
if (should_download) {
std::string path_temporary = path + ".downloadInProgress";
if (file_exists) {
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
if (remove(path.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
return false;
}
}
// Set the output file
struct FILE_deleter {
void operator()(FILE * f) const {
fclose(f);
}
};
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
if (!outfile) {
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
return false;
}
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
return fwrite(data, size, nmemb, (FILE *)fd);
};
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
// display download progress
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
// helper function to hide password in URL
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
std::size_t protocol_pos = url.find("://");
if (protocol_pos == std::string::npos) {
return url; // Malformed URL
}
std::size_t at_pos = url.find('@', protocol_pos + 3);
if (at_pos == std::string::npos) {
return url; // No password in URL
}
return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
};
// start the download
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) {
return false;
}
long http_code = 0;
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
if (http_code < 200 || http_code >= 400) {
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
return false;
}
// Causes file to be closed explicitly here before we rename it.
outfile.reset();
// Write the updated JSON metadata file.
metadata.update({
{"url", url},
{"etag", headers.etag},
{"lastModified", headers.last_modified}
});
std::ofstream(metadata_path) << metadata.dump(4);
LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return false;
}
}
return true;
}
struct llama_model * common_load_model_from_url(
const std::string & model_url,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params) {
// Basic validation of the model_url
if (model_url.empty()) {
LOG_ERR("%s: invalid model_url\n", __func__);
return NULL;
}
if (!common_download_file(model_url, local_path, hf_token)) {
return NULL;
}
// check for additional GGUFs split to download
int n_split = 0;
{
struct gguf_init_params gguf_params = {
/*.no_alloc = */ true,
/*.ctx = */ NULL,
};
auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
if (!ctx_gguf) {
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
return NULL;
}
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
if (key_n_split >= 0) {
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
}
gguf_free(ctx_gguf);
}
if (n_split > 1) {
char split_prefix[PATH_MAX] = {0};
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
// Verify the first split file format
// and extract split URL and PATH prefixes
{
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
return NULL;
}
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
return NULL;
}
}
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
for (int idx = 1; idx < n_split; idx++) {
futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
return common_download_file(split_url, split_path, hf_token);
}, idx));
}
// Wait for all downloads to complete
for (auto & f : futures_download) {
if (!f.get()) {
return NULL;
}
}
}
return llama_model_load_from_file(local_path.c_str(), params);
}
struct llama_model * common_load_model_from_hf(
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params) {
// construct hugging face model url:
//
// --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
// https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
//
// --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
// https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
//
std::string model_url = "https://huggingface.co/";
model_url += repo;
model_url += "/resolve/main/";
model_url += remote_path;
return common_load_model_from_url(model_url, local_path, hf_token, params);
}
/**
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
*
* Return pair of <repo, file> (with "repo" already having tag removed)
*
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
*/
std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
if (string_split<std::string>(hf_repo, '/').size() != 2) {
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
}
// fetch model info from Hugging Face Hub API
json model_info;
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
std::string res_str;
std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
return size * nmemb;
};
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
#if defined(_WIN32)
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
#endif
if (!hf_token.empty()) {
std::string auth_header = "Authorization: Bearer " + hf_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
}
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
CURLcode res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
throw std::runtime_error("error: cannot make GET request to HF API");
}
long res_code;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
if (res_code == 200) {
model_info = json::parse(res_str);
} else if (res_code == 401) {
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
} else {
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
}
// check response
if (!model_info.contains("ggufFile")) {
throw std::runtime_error("error: model does not have ggufFile");
}
json & gguf_file = model_info.at("ggufFile");
if (!gguf_file.contains("rfilename")) {
throw std::runtime_error("error: ggufFile does not have rfilename");
}
return std::make_pair(hf_repo, gguf_file.at("rfilename"));
}
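The <repo>[:tag] convention documented above can be sketched as a split on the last ':' with a "latest" fallback; the real code uses string_split and then queries the Hugging Face manifest API with the tag:

#include <iostream>
#include <string>
#include <utility>

// Simplified: everything after the last ':' is the tag, otherwise "latest".
static std::pair<std::string, std::string> split_repo_tag(const std::string & hf_repo_with_tag) {
    const auto pos = hf_repo_with_tag.rfind(':');
    if (pos == std::string::npos) {
        return {hf_repo_with_tag, "latest"};
    }
    return {hf_repo_with_tag.substr(0, pos), hf_repo_with_tag.substr(pos + 1)};
}

int main() {
    for (const auto & s : { "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",
                            "bartowski/Llama-3.2-3B-Instruct-GGUF" }) {
        const auto [repo, tag] = split_repo_tag(s);
        std::cout << repo << " -> " << tag << "\n";
    }
    // bartowski/Llama-3.2-3B-Instruct-GGUF -> Q4_K_M
    // bartowski/Llama-3.2-3B-Instruct-GGUF -> latest
}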
#else
struct llama_model * common_load_model_from_url(
const std::string & /*model_url*/,
const std::string & /*local_path*/,
const std::string & /*hf_token*/,
const struct llama_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr;
}
struct llama_model * common_load_model_from_hf(
const std::string & /*repo*/,
const std::string & /*remote_path*/,
const std::string & /*local_path*/,
const std::string & /*hf_token*/,
const struct llama_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr;
}
std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return std::make_pair("", "");
}
#endif // LLAMA_USE_CURL
//
// Batch utils
//
@ -2025,4 +1565,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
return result;
}

View File

@ -110,9 +110,17 @@ enum common_conversation_mode {
COMMON_CONVERSATION_MODE_AUTO = 2,
};
enum common_grammar_trigger_type {
COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
};
struct common_grammar_trigger {
std::string word;
bool at_start;
common_grammar_trigger_type type;
std::string value;
llama_token token = LLAMA_TOKEN_NULL;
};
// sampling parameters
@ -163,8 +171,7 @@ struct common_params_sampling {
std::string grammar; // optional BNF-like grammar to constrain sampling
bool grammar_lazy = false;
std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
std::vector<llama_token> grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
std::set<llama_token> preserved_tokens;
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
@ -173,6 +180,13 @@ struct common_params_sampling {
std::string print() const;
};
struct common_params_model {
std::string path = ""; // model local path // NOLINT
std::string url = ""; // model url to download // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
};
struct common_params_speculative {
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
@ -186,19 +200,13 @@ struct common_params_speculative {
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string model = ""; // draft model for speculative decoding // NOLINT
std::string model_url = ""; // model url to download // NOLINT
struct common_params_model model;
};
struct common_params_vocoder {
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
struct common_params_model model;
std::string model = ""; // model path // NOLINT
std::string model_url = ""; // model url to download // NOLINT
std::string speaker_file = ""; // speaker file path // NOLINT
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
};
@ -254,13 +262,12 @@ struct common_params {
struct common_params_speculative speculative;
struct common_params_vocoder vocoder;
std::string model = ""; // model path // NOLINT
struct common_params_model model;
std::string model_alias = ""; // model alias // NOLINT
std::string model_url = ""; // model url to download // NOLINT
std::string hf_token = ""; // HF token // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string prompt = ""; // NOLINT
std::string system_prompt = ""; // NOLINT
std::string prompt_file = ""; // store the external prompt file name // NOLINT
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
@ -272,6 +279,7 @@ struct common_params {
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@ -325,13 +333,17 @@ struct common_params {
bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data
bool single_turn = false; // single turn chat conversation
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector // NOLINT
struct common_params_model mmproj;
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)
// embedding
@ -391,8 +403,6 @@ struct common_params {
int32_t i_pos = -1; // position of the passkey in the junk text
// imatrix params
std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
int32_t i_chunk = 0; // start processing from this chunk
@ -404,16 +414,16 @@ struct common_params {
int n_pca_batch = 100;
int n_pca_iterations = 1000;
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
std::string cvector_outfile = "control_vector.gguf";
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
bool spm_infill = false; // suffix/prefix/middle pattern for infill
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
// batched-bench params
bool batched_bench_output_jsonl = false;
// common params
std::string out_file; // output filename for all example programs
};
// call once at the start of a program if it uses libcommon
@ -453,6 +463,8 @@ std::string string_repeat(const std::string & str, size_t n);
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
std::string regex_escape(const std::string & s);
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
@ -530,26 +542,11 @@ struct llama_model_params common_model_params_to_llama ( common_params
struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
struct llama_model * common_load_model_from_url(
const std::string & model_url,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf(
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
std::pair<std::string, std::string> common_get_hf_file(
const std::string & hf_repo_with_tag,
const std::string & hf_token);
// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
std::string get_model_endpoint();
//
// Batch utils
//

View File

@ -16,6 +16,9 @@ using json = nlohmann::ordered_json;
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
auto has_max = max_items != std::numeric_limits<int>::max();
if (max_items == 0) {
return "";
}
if (min_items == 0 && max_items == 1) {
return item_rule + "?";
}
@ -264,7 +267,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
throw std::runtime_error("At least one of min_value or max_value must be set");
}
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
const std::string SPACE_RULE = "| \" \" | \"\\n\"{1,2} [ \\t]{0,20}";
struct BuiltinRule {
std::string content;
@ -764,11 +767,10 @@ private:
public:
SchemaConverter(
const std::function<json(const std::string &)> & fetch_json,
bool dotall,
bool compact_spaces)
bool dotall)
: _fetch_json(fetch_json), _dotall(dotall)
{
_rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
_rules["space"] = SPACE_RULE;
}
void resolve_refs(json & schema, const std::string & url) {
@ -1007,7 +1009,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
}
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
common_grammar_builder builder {
/* .add_rule = */ [&](const std::string & name, const std::string & rule) {
return converter._add_rule(name, rule);

View File

@ -16,7 +16,6 @@ struct common_grammar_builder {
struct common_grammar_options {
bool dotall = false;
bool compact_spaces = false;
};
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});

View File

@ -11,25 +11,24 @@ struct llama_sampler_llg {
std::string grammar_kind;
std::string grammar_data;
LlgTokenizer * tokenizer;
LlgConstraint * grammar;
LlgMaskResult llg_res;
bool has_llg_res;
LlgMatcher * grammar;
};
static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
const char * grammar_data) {
static LlgMatcher * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
const char * grammar_data) {
LlgConstraintInit cinit;
llg_constraint_init_set_defaults(&cinit, tokenizer);
const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
if (log_level && *log_level) {
cinit.log_stderr_level = atoi(log_level);
}
auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
if (llg_get_error(c)) {
LOG_ERR("llg error: %s\n", llg_get_error(c));
llg_free_constraint(c);
auto c = llg_new_matcher(&cinit, grammar_kind, grammar_data);
if (llg_matcher_get_error(c)) {
LOG_ERR("llg error: %s\n", llg_matcher_get_error(c));
llg_free_matcher(c);
return nullptr;
}
return c;
}
@ -40,39 +39,29 @@ static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
auto * ctx = (llama_sampler_llg *) smpl->ctx;
if (ctx->grammar) {
LlgCommitResult res;
llg_commit_token(ctx->grammar, token, &res);
ctx->has_llg_res = false;
llg_matcher_consume_token(ctx->grammar, token);
}
}
static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_llg *) smpl->ctx;
if (ctx->grammar) {
if (!ctx->has_llg_res) {
if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
ctx->has_llg_res = true;
const uint32_t * mask = llg_matcher_get_mask(ctx->grammar);
if (mask == nullptr) {
if (llg_matcher_compute_mask(ctx->grammar) == 0) {
mask = llg_matcher_get_mask(ctx->grammar);
} else {
LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
llg_free_constraint(ctx->grammar);
LOG_ERR("llg error: %s\n", llg_matcher_get_error(ctx->grammar));
llg_free_matcher(ctx->grammar);
ctx->grammar = nullptr;
return;
}
}
if (ctx->has_llg_res) {
if (ctx->llg_res.is_stop) {
for (size_t i = 0; i < cur_p->size; ++i) {
if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
cur_p->data[i].logit = -INFINITY;
}
}
} else {
const uint32_t * mask = ctx->llg_res.sample_mask;
for (size_t i = 0; i < cur_p->size; ++i) {
auto token = cur_p->data[i].id;
if ((mask[token / 32] & (1 << (token % 32))) == 0) {
cur_p->data[i].logit = -INFINITY;
}
}
for (size_t i = 0; i < cur_p->size; ++i) {
auto token = cur_p->data[i].id;
if ((mask[token / 32] & (1 << (token % 32))) == 0) {
cur_p->data[i].logit = -INFINITY;
}
}
}
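The mask returned by llg_matcher_get_mask() packs one bit per token id: word token / 32, bit token % 32, which is also why the assert further down sizes it as ((n_tokens + 31) / 32) * 4 bytes. A tiny illustration with a made-up vocabulary:

#include <cstdint>
#include <cstdio>
#include <vector>

static bool allowed(const std::vector<uint32_t> & mask, int token) {
    return (mask[token / 32] & (1u << (token % 32))) != 0;
}

int main() {
    const int n_vocab = 100;                            // made-up vocabulary size
    std::vector<uint32_t> mask((n_vocab + 31) / 32, 0); // 4 words = 16 bytes
    for (const int t : {2, 37, 64}) {                   // tokens the matcher would allow
        mask[t / 32] |= 1u << (t % 32);
    }
    for (const int t : {2, 3, 37, 64, 99}) {
        std::printf("token %2d -> %s\n", t, allowed(mask, t) ? "allowed" : "logit = -INFINITY");
    }
}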
@ -80,14 +69,9 @@ static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array
static void llama_sampler_llg_reset(llama_sampler * smpl) {
auto * ctx = (llama_sampler_llg *) smpl->ctx;
if (!ctx->grammar) {
return;
if (ctx->grammar) {
llg_matcher_reset(ctx->grammar);
}
auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
llg_free_constraint(ctx->grammar);
ctx->grammar = grammar_new;
ctx->has_llg_res = false;
}
static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
@ -102,7 +86,7 @@ static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
if (ctx->grammar) {
result_ctx->grammar_kind = ctx->grammar_kind;
result_ctx->grammar_data = ctx->grammar_data;
result_ctx->grammar = llg_clone_constraint(ctx->grammar);
result_ctx->grammar = llg_clone_matcher(ctx->grammar);
result_ctx->tokenizer = llg_clone_tokenizer(ctx->tokenizer);
}
}
@ -114,7 +98,7 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
const auto * ctx = (llama_sampler_llg *) smpl->ctx;
if (ctx->grammar) {
llg_free_constraint(ctx->grammar);
llg_free_matcher(ctx->grammar);
llg_free_tokenizer(ctx->tokenizer);
}
@ -239,9 +223,11 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
/* .grammar_data = */ grammar_data,
/* .tokenizer = */ tokenizer,
/* .grammar = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
/* .llg_res = */ {},
/* .has_llg_res = */ false,
};
if (ctx->grammar) {
GGML_ASSERT(((size_t) llama_vocab_n_tokens(vocab) + 31) / 32 * 4 ==
llg_matcher_get_mask_byte_size(ctx->grammar));
}
} else {
*ctx = {
/* .vocab = */ vocab,
@ -249,15 +235,12 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
/* .grammar_data = */ {},
/* .tokenizer = */ nullptr,
/* .grammar = */ nullptr,
/* .llg_res = */ {},
/* .has_llg_res = */ false,
};
}
return llama_sampler_init(
/* .iface = */ &llama_sampler_llg_i,
/* .ctx = */ ctx
);
/* .ctx = */ ctx);
}
#else

View File

@ -9,10 +9,19 @@
#pragma once
#include "minja.hpp"
#include <json.hpp>
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <exception>
#include <iomanip>
#include <memory>
#include <sstream>
#include <string>
#include <vector>
#include <json.hpp>
using json = nlohmann::ordered_json;
namespace minja {
@ -425,7 +434,7 @@ class chat_template {
auto obj = json {
{"tool_calls", tool_calls},
};
if (!content.is_null() && content != "") {
if (!content.is_null() && !content.empty()) {
obj["content"] = content;
}
message["content"] = obj.dump(2);
@ -435,13 +444,12 @@ class chat_template {
if (polyfill_tool_responses && role == "tool") {
message["role"] = "user";
auto obj = json {
{"tool_response", {
{"content", message.at("content")},
}},
{"tool_response", json::object()},
};
if (message.contains("name")) {
obj["tool_response"]["name"] = message.at("name");
obj["tool_response"]["tool"] = message.at("name");
}
obj["tool_response"]["content"] = message.at("content");
if (message.contains("tool_call_id")) {
obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
}
@ -510,7 +518,7 @@ class chat_template {
static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) {
json messages_with_system = messages;
if (messages_with_system.size() > 0 && messages_with_system[0].at("role") == "system") {
if (!messages_with_system.empty() && messages_with_system[0].at("role") == "system") {
std::string existing_system = messages_with_system.at(0).at("content");
messages_with_system[0] = json {
{"role", "system"},

View File

@ -8,14 +8,26 @@
// SPDX-License-Identifier: MIT
#pragma once
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <cmath>
#include <exception>
#include <functional>
#include <iostream>
#include <string>
#include <vector>
#include <regex>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <stdexcept>
#include <regex>
#include <sstream>
#include <string>
#include <stdexcept>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <json.hpp>
using json = nlohmann::ordered_json;
@ -731,51 +743,51 @@ public:
struct TextTemplateToken : public TemplateToken {
std::string text;
TextTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Text, location, pre, post), text(t) {}
TextTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Text, loc, pre, post), text(t) {}
};
struct ExpressionTemplateToken : public TemplateToken {
std::shared_ptr<Expression> expr;
ExpressionTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && e) : TemplateToken(Type::Expression, location, pre, post), expr(std::move(e)) {}
ExpressionTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && e) : TemplateToken(Type::Expression, loc, pre, post), expr(std::move(e)) {}
};
struct IfTemplateToken : public TemplateToken {
std::shared_ptr<Expression> condition;
IfTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::If, location, pre, post), condition(std::move(c)) {}
IfTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::If, loc, pre, post), condition(std::move(c)) {}
};
struct ElifTemplateToken : public TemplateToken {
std::shared_ptr<Expression> condition;
ElifTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::Elif, location, pre, post), condition(std::move(c)) {}
ElifTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::Elif, loc, pre, post), condition(std::move(c)) {}
};
struct ElseTemplateToken : public TemplateToken {
ElseTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Else, location, pre, post) {}
ElseTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Else, loc, pre, post) {}
};
struct EndIfTemplateToken : public TemplateToken {
EndIfTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndIf, location, pre, post) {}
EndIfTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndIf, loc, pre, post) {}
};
struct MacroTemplateToken : public TemplateToken {
std::shared_ptr<VariableExpr> name;
Expression::Parameters params;
MacroTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p)
: TemplateToken(Type::Macro, location, pre, post), name(std::move(n)), params(std::move(p)) {}
MacroTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p)
: TemplateToken(Type::Macro, loc, pre, post), name(std::move(n)), params(std::move(p)) {}
};
struct EndMacroTemplateToken : public TemplateToken {
EndMacroTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndMacro, location, pre, post) {}
EndMacroTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndMacro, loc, pre, post) {}
};
struct FilterTemplateToken : public TemplateToken {
std::shared_ptr<Expression> filter;
FilterTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && filter)
: TemplateToken(Type::Filter, location, pre, post), filter(std::move(filter)) {}
FilterTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && filter)
: TemplateToken(Type::Filter, loc, pre, post), filter(std::move(filter)) {}
};
struct EndFilterTemplateToken : public TemplateToken {
EndFilterTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFilter, location, pre, post) {}
EndFilterTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFilter, loc, pre, post) {}
};
struct ForTemplateToken : public TemplateToken {
@ -783,38 +795,38 @@ struct ForTemplateToken : public TemplateToken {
std::shared_ptr<Expression> iterable;
std::shared_ptr<Expression> condition;
bool recursive;
ForTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::vector<std::string> & vns, std::shared_ptr<Expression> && iter,
ForTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, const std::vector<std::string> & vns, std::shared_ptr<Expression> && iter,
std::shared_ptr<Expression> && c, bool r)
: TemplateToken(Type::For, location, pre, post), var_names(vns), iterable(std::move(iter)), condition(std::move(c)), recursive(r) {}
: TemplateToken(Type::For, loc, pre, post), var_names(vns), iterable(std::move(iter)), condition(std::move(c)), recursive(r) {}
};
struct EndForTemplateToken : public TemplateToken {
EndForTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFor, location, pre, post) {}
EndForTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFor, loc, pre, post) {}
};
struct GenerationTemplateToken : public TemplateToken {
GenerationTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Generation, location, pre, post) {}
GenerationTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Generation, loc, pre, post) {}
};
struct EndGenerationTemplateToken : public TemplateToken {
EndGenerationTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndGeneration, location, pre, post) {}
EndGenerationTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndGeneration, loc, pre, post) {}
};
struct SetTemplateToken : public TemplateToken {
std::string ns;
std::vector<std::string> var_names;
std::shared_ptr<Expression> value;
SetTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
: TemplateToken(Type::Set, location, pre, post), ns(ns), var_names(vns), value(std::move(v)) {}
SetTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
: TemplateToken(Type::Set, loc, pre, post), ns(ns), var_names(vns), value(std::move(v)) {}
};
struct EndSetTemplateToken : public TemplateToken {
EndSetTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndSet, location, pre, post) {}
EndSetTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndSet, loc, pre, post) {}
};
struct CommentTemplateToken : public TemplateToken {
std::string text;
CommentTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Comment, location, pre, post), text(t) {}
CommentTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Comment, loc, pre, post), text(t) {}
};
enum class LoopControlType { Break, Continue };
@ -830,7 +842,7 @@ public:
struct LoopControlTemplateToken : public TemplateToken {
LoopControlType control_type;
LoopControlTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, LoopControlType control_type) : TemplateToken(Type::Break, location, pre, post), control_type(control_type) {}
LoopControlTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, LoopControlType control_type) : TemplateToken(Type::Break, loc, pre, post), control_type(control_type) {}
};
class TemplateNode {
@ -868,8 +880,8 @@ public:
class SequenceNode : public TemplateNode {
std::vector<std::shared_ptr<TemplateNode>> children;
public:
SequenceNode(const Location & location, std::vector<std::shared_ptr<TemplateNode>> && c)
: TemplateNode(location), children(std::move(c)) {}
SequenceNode(const Location & loc, std::vector<std::shared_ptr<TemplateNode>> && c)
: TemplateNode(loc), children(std::move(c)) {}
void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
for (const auto& child : children) child->render(out, context);
}
@ -878,7 +890,7 @@ public:
class TextNode : public TemplateNode {
std::string text;
public:
TextNode(const Location & location, const std::string& t) : TemplateNode(location), text(t) {}
TextNode(const Location & loc, const std::string& t) : TemplateNode(loc), text(t) {}
void do_render(std::ostringstream & out, const std::shared_ptr<Context> &) const override {
out << text;
}
@ -887,7 +899,7 @@ public:
class ExpressionNode : public TemplateNode {
std::shared_ptr<Expression> expr;
public:
ExpressionNode(const Location & location, std::shared_ptr<Expression> && e) : TemplateNode(location), expr(std::move(e)) {}
ExpressionNode(const Location & loc, std::shared_ptr<Expression> && e) : TemplateNode(loc), expr(std::move(e)) {}
void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
if (!expr) throw std::runtime_error("ExpressionNode.expr is null");
auto result = expr->evaluate(context);
@ -904,8 +916,8 @@ public:
class IfNode : public TemplateNode {
std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<TemplateNode>>> cascade;
public:
IfNode(const Location & location, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<TemplateNode>>> && c)
: TemplateNode(location), cascade(std::move(c)) {}
IfNode(const Location & loc, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<TemplateNode>>> && c)
: TemplateNode(loc), cascade(std::move(c)) {}
void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
for (const auto& branch : cascade) {
auto enter_branch = true;
@ -924,7 +936,7 @@ public:
class LoopControlNode : public TemplateNode {
LoopControlType control_type_;
public:
LoopControlNode(const Location & location, LoopControlType control_type) : TemplateNode(location), control_type_(control_type) {}
LoopControlNode(const Location & loc, LoopControlType control_type) : TemplateNode(loc), control_type_(control_type) {}
void do_render(std::ostringstream &, const std::shared_ptr<Context> &) const override {
throw LoopControlException(control_type_);
}
@ -938,9 +950,9 @@ class ForNode : public TemplateNode {
bool recursive;
std::shared_ptr<TemplateNode> else_body;
public:
ForNode(const Location & location, std::vector<std::string> && var_names, std::shared_ptr<Expression> && iterable,
ForNode(const Location & loc, std::vector<std::string> && var_names, std::shared_ptr<Expression> && iterable,
std::shared_ptr<Expression> && condition, std::shared_ptr<TemplateNode> && body, bool recursive, std::shared_ptr<TemplateNode> && else_body)
: TemplateNode(location), var_names(var_names), iterable(std::move(iterable)), condition(std::move(condition)), body(std::move(body)), recursive(recursive), else_body(std::move(else_body)) {}
: TemplateNode(loc), var_names(var_names), iterable(std::move(iterable)), condition(std::move(condition)), body(std::move(body)), recursive(recursive), else_body(std::move(else_body)) {}
void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
// https://jinja.palletsprojects.com/en/3.0.x/templates/#for
@ -1025,8 +1037,8 @@ class MacroNode : public TemplateNode {
std::shared_ptr<TemplateNode> body;
std::unordered_map<std::string, size_t> named_param_positions;
public:
MacroNode(const Location & location, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p, std::shared_ptr<TemplateNode> && b)
: TemplateNode(location), name(std::move(n)), params(std::move(p)), body(std::move(b)) {
MacroNode(const Location & loc, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p, std::shared_ptr<TemplateNode> && b)
: TemplateNode(loc), name(std::move(n)), params(std::move(p)), body(std::move(b)) {
for (size_t i = 0; i < params.size(); ++i) {
const auto & name = params[i].first;
if (!name.empty()) {
@ -1072,8 +1084,8 @@ class FilterNode : public TemplateNode {
std::shared_ptr<TemplateNode> body;
public:
FilterNode(const Location & location, std::shared_ptr<Expression> && f, std::shared_ptr<TemplateNode> && b)
: TemplateNode(location), filter(std::move(f)), body(std::move(b)) {}
FilterNode(const Location & loc, std::shared_ptr<Expression> && f, std::shared_ptr<TemplateNode> && b)
: TemplateNode(loc), filter(std::move(f)), body(std::move(b)) {}
void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
if (!filter) throw std::runtime_error("FilterNode.filter is null");
@ -1095,8 +1107,8 @@ class SetNode : public TemplateNode {
std::vector<std::string> var_names;
std::shared_ptr<Expression> value;
public:
SetNode(const Location & location, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
: TemplateNode(location), ns(ns), var_names(vns), value(std::move(v)) {}
SetNode(const Location & loc, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
: TemplateNode(loc), ns(ns), var_names(vns), value(std::move(v)) {}
void do_render(std::ostringstream &, const std::shared_ptr<Context> & context) const override {
if (!value) throw std::runtime_error("SetNode.value is null");
if (!ns.empty()) {
@ -1118,8 +1130,8 @@ class SetTemplateNode : public TemplateNode {
std::string name;
std::shared_ptr<TemplateNode> template_value;
public:
SetTemplateNode(const Location & location, const std::string & name, std::shared_ptr<TemplateNode> && tv)
: TemplateNode(location), name(name), template_value(std::move(tv)) {}
SetTemplateNode(const Location & loc, const std::string & name, std::shared_ptr<TemplateNode> && tv)
: TemplateNode(loc), name(name), template_value(std::move(tv)) {}
void do_render(std::ostringstream &, const std::shared_ptr<Context> & context) const override {
if (!template_value) throw std::runtime_error("SetTemplateNode.template_value is null");
Value value { template_value->render(context) };
@ -1132,8 +1144,8 @@ class IfExpr : public Expression {
std::shared_ptr<Expression> then_expr;
std::shared_ptr<Expression> else_expr;
public:
IfExpr(const Location & location, std::shared_ptr<Expression> && c, std::shared_ptr<Expression> && t, std::shared_ptr<Expression> && e)
: Expression(location), condition(std::move(c)), then_expr(std::move(t)), else_expr(std::move(e)) {}
IfExpr(const Location & loc, std::shared_ptr<Expression> && c, std::shared_ptr<Expression> && t, std::shared_ptr<Expression> && e)
: Expression(loc), condition(std::move(c)), then_expr(std::move(t)), else_expr(std::move(e)) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
if (!condition) throw std::runtime_error("IfExpr.condition is null");
if (!then_expr) throw std::runtime_error("IfExpr.then_expr is null");
@ -1150,16 +1162,16 @@ public:
class LiteralExpr : public Expression {
Value value;
public:
LiteralExpr(const Location & location, const Value& v)
: Expression(location), value(v) {}
LiteralExpr(const Location & loc, const Value& v)
: Expression(loc), value(v) {}
Value do_evaluate(const std::shared_ptr<Context> &) const override { return value; }
};
class ArrayExpr : public Expression {
std::vector<std::shared_ptr<Expression>> elements;
public:
ArrayExpr(const Location & location, std::vector<std::shared_ptr<Expression>> && e)
: Expression(location), elements(std::move(e)) {}
ArrayExpr(const Location & loc, std::vector<std::shared_ptr<Expression>> && e)
: Expression(loc), elements(std::move(e)) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
auto result = Value::array();
for (const auto& e : elements) {
@ -1173,8 +1185,8 @@ public:
class DictExpr : public Expression {
std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<Expression>>> elements;
public:
DictExpr(const Location & location, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<Expression>>> && e)
: Expression(location), elements(std::move(e)) {}
DictExpr(const Location & loc, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<Expression>>> && e)
: Expression(loc), elements(std::move(e)) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
auto result = Value::object();
for (const auto& [key, value] : elements) {
@ -1189,8 +1201,8 @@ public:
class SliceExpr : public Expression {
public:
std::shared_ptr<Expression> start, end;
SliceExpr(const Location & location, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e)
: Expression(location), start(std::move(s)), end(std::move(e)) {}
SliceExpr(const Location & loc, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e)
: Expression(loc), start(std::move(s)), end(std::move(e)) {}
Value do_evaluate(const std::shared_ptr<Context> &) const override {
throw std::runtime_error("SliceExpr not implemented");
}
@ -1200,8 +1212,8 @@ class SubscriptExpr : public Expression {
std::shared_ptr<Expression> base;
std::shared_ptr<Expression> index;
public:
SubscriptExpr(const Location & location, std::shared_ptr<Expression> && b, std::shared_ptr<Expression> && i)
: Expression(location), base(std::move(b)), index(std::move(i)) {}
SubscriptExpr(const Location & loc, std::shared_ptr<Expression> && b, std::shared_ptr<Expression> && i)
: Expression(loc), base(std::move(b)), index(std::move(i)) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
if (!base) throw std::runtime_error("SubscriptExpr.base is null");
if (!index) throw std::runtime_error("SubscriptExpr.index is null");
@ -1243,8 +1255,8 @@ public:
enum class Op { Plus, Minus, LogicalNot, Expansion, ExpansionDict };
std::shared_ptr<Expression> expr;
Op op;
UnaryOpExpr(const Location & location, std::shared_ptr<Expression> && e, Op o)
: Expression(location), expr(std::move(e)), op(o) {}
UnaryOpExpr(const Location & loc, std::shared_ptr<Expression> && e, Op o)
: Expression(loc), expr(std::move(e)), op(o) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
if (!expr) throw std::runtime_error("UnaryOpExpr.expr is null");
auto e = expr->evaluate(context);
@ -1269,8 +1281,8 @@ private:
std::shared_ptr<Expression> right;
Op op;
public:
BinaryOpExpr(const Location & location, std::shared_ptr<Expression> && l, std::shared_ptr<Expression> && r, Op o)
: Expression(location), left(std::move(l)), right(std::move(r)), op(o) {}
BinaryOpExpr(const Location & loc, std::shared_ptr<Expression> && l, std::shared_ptr<Expression> && r, Op o)
: Expression(loc), left(std::move(l)), right(std::move(r)), op(o) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
if (!left) throw std::runtime_error("BinaryOpExpr.left is null");
if (!right) throw std::runtime_error("BinaryOpExpr.right is null");
@ -1378,13 +1390,27 @@ struct ArgumentsExpression {
}
};
static std::string strip(const std::string & s) {
auto start = s.find_first_not_of(" \t\n\r");
static std::string strip(const std::string & s, const std::string & chars = "", bool left = true, bool right = true) {
auto charset = chars.empty() ? " \t\n\r" : chars;
auto start = left ? s.find_first_not_of(charset) : 0;
if (start == std::string::npos) return "";
auto end = s.find_last_not_of(" \t\n\r");
auto end = right ? s.find_last_not_of(charset) : s.size() - 1;
return s.substr(start, end - start + 1);
}
static std::vector<std::string> split(const std::string & s, const std::string & sep) {
std::vector<std::string> result;
size_t start = 0;
size_t end = s.find(sep);
while (end != std::string::npos) {
result.push_back(s.substr(start, end - start));
start = end + sep.length();
end = s.find(sep, start);
}
result.push_back(s.substr(start));
return result;
}
static std::string capitalize(const std::string & s) {
if (s.empty()) return s;
auto result = s;
@ -1413,8 +1439,8 @@ class MethodCallExpr : public Expression {
std::shared_ptr<VariableExpr> method;
ArgumentsExpression args;
public:
MethodCallExpr(const Location & location, std::shared_ptr<Expression> && obj, std::shared_ptr<VariableExpr> && m, ArgumentsExpression && a)
: Expression(location), object(std::move(obj)), method(std::move(m)), args(std::move(a)) {}
MethodCallExpr(const Location & loc, std::shared_ptr<Expression> && obj, std::shared_ptr<VariableExpr> && m, ArgumentsExpression && a)
: Expression(loc), object(std::move(obj)), method(std::move(m)), args(std::move(a)) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
if (!object) throw std::runtime_error("MethodCallExpr.object is null");
if (!method) throw std::runtime_error("MethodCallExpr.method is null");
@ -1467,8 +1493,26 @@ public:
} else if (obj.is_string()) {
auto str = obj.get<std::string>();
if (method->get_name() == "strip") {
vargs.expectArgs("strip method", {0, 0}, {0, 0});
return Value(strip(str));
vargs.expectArgs("strip method", {0, 1}, {0, 0});
auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
return Value(strip(str, chars));
} else if (method->get_name() == "lstrip") {
vargs.expectArgs("lstrip method", {0, 1}, {0, 0});
auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
return Value(strip(str, chars, /* left= */ true, /* right= */ false));
} else if (method->get_name() == "rstrip") {
vargs.expectArgs("rstrip method", {0, 1}, {0, 0});
auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
return Value(strip(str, chars, /* left= */ false, /* right= */ true));
} else if (method->get_name() == "split") {
vargs.expectArgs("split method", {1, 1}, {0, 0});
auto sep = vargs.args[0].get<std::string>();
auto parts = split(str, sep);
Value result = Value::array();
for (const auto& part : parts) {
result.push_back(Value(part));
}
return result;
} else if (method->get_name() == "capitalize") {
vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
return Value(capitalize(str));
@ -1494,8 +1538,8 @@ class CallExpr : public Expression {
public:
std::shared_ptr<Expression> object;
ArgumentsExpression args;
CallExpr(const Location & location, std::shared_ptr<Expression> && obj, ArgumentsExpression && a)
: Expression(location), object(std::move(obj)), args(std::move(a)) {}
CallExpr(const Location & loc, std::shared_ptr<Expression> && obj, ArgumentsExpression && a)
: Expression(loc), object(std::move(obj)), args(std::move(a)) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
if (!object) throw std::runtime_error("CallExpr.object is null");
auto obj = object->evaluate(context);
@ -1510,8 +1554,8 @@ public:
class FilterExpr : public Expression {
std::vector<std::shared_ptr<Expression>> parts;
public:
FilterExpr(const Location & location, std::vector<std::shared_ptr<Expression>> && p)
: Expression(location), parts(std::move(p)) {}
FilterExpr(const Location & loc, std::vector<std::shared_ptr<Expression>> && p)
: Expression(loc), parts(std::move(p)) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
Value result;
bool first = true;
@ -2428,7 +2472,7 @@ private:
static std::regex leading_space_regex(R"(^\s+)");
text = std::regex_replace(text, leading_space_regex, "");
} else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast<ExpressionTemplateToken*>((*(it - 2)).get())) {
if (text.length() > 0 && text[0] == '\n') {
if (!text.empty() && text[0] == '\n') {
text.erase(0, 1);
}
}
@ -2506,7 +2550,7 @@ public:
TemplateTokenIterator begin = tokens.begin();
auto it = begin;
TemplateTokenIterator end = tokens.end();
return parser.parseTemplate(begin, it, end, /* full= */ true);
return parser.parseTemplate(begin, it, end, /* fully= */ true);
}
};
@ -2545,7 +2589,7 @@ inline std::shared_ptr<Context> Context::builtins() {
throw std::runtime_error(args.at("message").get<std::string>());
}));
globals.set("tojson", simple_function("tojson", { "value", "indent" }, [](const std::shared_ptr<Context> &, Value & args) {
return Value(args.at("value").dump(args.get<int64_t>("indent", -1), /* tojson= */ true));
return Value(args.at("value").dump(args.get<int64_t>("indent", -1), /* to_json= */ true));
}));
globals.set("items", simple_function("items", { "object" }, [](const std::shared_ptr<Context> &, Value & args) {
auto items = Value::array();
@ -2567,21 +2611,25 @@ inline std::shared_ptr<Context> Context::builtins() {
globals.set("last", simple_function("last", { "items" }, [](const std::shared_ptr<Context> &, Value & args) {
auto items = args.at("items");
if (!items.is_array()) throw std::runtime_error("object is not a list");
if (items.size() == 0) return Value();
if (items.empty()) return Value();
return items.at(items.size() - 1);
}));
globals.set("trim", simple_function("trim", { "text" }, [](const std::shared_ptr<Context> &, Value & args) {
auto & text = args.at("text");
return text.is_null() ? text : Value(strip(text.get<std::string>()));
}));
globals.set("lower", simple_function("lower", { "text" }, [](const std::shared_ptr<Context> &, Value & args) {
auto text = args.at("text");
if (text.is_null()) return text;
std::string res;
auto str = text.get<std::string>();
std::transform(str.begin(), str.end(), std::back_inserter(res), ::tolower);
return Value(res);
}));
auto char_transform_function = [](const std::string & name, const std::function<char(char)> & fn) {
return simple_function(name, { "text" }, [=](const std::shared_ptr<Context> &, Value & args) {
auto text = args.at("text");
if (text.is_null()) return text;
std::string res;
auto str = text.get<std::string>();
std::transform(str.begin(), str.end(), std::back_inserter(res), fn);
return Value(res);
});
};
globals.set("lower", char_transform_function("lower", ::tolower));
globals.set("upper", char_transform_function("upper", ::toupper));
globals.set("default", Value::callable([=](const std::shared_ptr<Context> &, ArgumentsValue & args) {
args.expectArgs("default", {2, 3}, {0, 1});
auto & value = args.args[0];
@ -2711,12 +2759,17 @@ inline std::shared_ptr<Context> Context::builtins() {
return Value::callable([=](const std::shared_ptr<Context> & context, ArgumentsValue & args) {
args.expectArgs(is_select ? "select" : "reject", {2, (std::numeric_limits<size_t>::max)()}, {0, 0});
auto & items = args.args[0];
if (items.is_null())
if (items.is_null()) {
return Value::array();
if (!items.is_array()) throw std::runtime_error("object is not iterable: " + items.dump());
}
if (!items.is_array()) {
throw std::runtime_error("object is not iterable: " + items.dump());
}
auto filter_fn = context->get(args.args[1]);
if (filter_fn.is_null()) throw std::runtime_error("Undefined filter: " + args.args[1].dump());
if (filter_fn.is_null()) {
throw std::runtime_error("Undefined filter: " + args.args[1].dump());
}
auto filter_args = Value::array();
for (size_t i = 2, n = args.args.size(); i < n; i++) {
@ -2838,20 +2891,25 @@ inline std::shared_ptr<Context> Context::builtins() {
auto v = arg.get<int64_t>();
startEndStep[i] = v;
param_set[i] = true;
}
}
for (auto & [name, value] : args.kwargs) {
size_t i;
if (name == "start") i = 0;
else if (name == "end") i = 1;
else if (name == "step") i = 2;
else throw std::runtime_error("Unknown argument " + name + " for function range");
}
for (auto & [name, value] : args.kwargs) {
size_t i;
if (name == "start") {
i = 0;
} else if (name == "end") {
i = 1;
} else if (name == "step") {
i = 2;
} else {
throw std::runtime_error("Unknown argument " + name + " for function range");
}
if (param_set[i]) {
throw std::runtime_error("Duplicate argument " + name + " for function range");
}
startEndStep[i] = value.get<int64_t>();
param_set[i] = true;
if (param_set[i]) {
throw std::runtime_error("Duplicate argument " + name + " for function range");
}
startEndStep[i] = value.get<int64_t>();
param_set[i] = true;
}
if (!param_set[1]) {
throw std::runtime_error("Missing required argument 'end' for function range");

View File

@ -7,6 +7,7 @@
#include <cstdio>
#include <fstream>
#include <thread>
#include <algorithm>
void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
std::vector<llama_token> & inp, int nnew, bool print_progress) {

View File

@ -4,6 +4,7 @@
#include <cmath>
#include <unordered_map>
#include <algorithm>
// the ring buffer works similarly to std::deque, but with a fixed capacity
// TODO: deduplicate with llama-impl.h
@ -159,17 +160,57 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE
} else {
std::vector<const char *> trigger_words;
trigger_words.reserve(params.grammar_trigger_words.size());
for (const auto & str : params.grammar_trigger_words) {
trigger_words.push_back(str.word.c_str());
std::vector<std::string> patterns_at_start;
std::vector<std::string> patterns_anywhere;
std::vector<llama_token> trigger_tokens;
for (const auto & trigger : params.grammar_triggers) {
switch (trigger.type) {
case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
{
const auto & word = trigger.value;
patterns_anywhere.push_back(regex_escape(word));
break;
}
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
{
const auto & pattern = trigger.value;
(trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
break;
}
case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
{
const auto token = trigger.token;
trigger_tokens.push_back(token);
break;
}
default:
GGML_ASSERT(false && "unknown trigger type");
}
}
std::vector<std::string> trigger_patterns;
if (!patterns_at_start.empty()) {
trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
}
if (!patterns_anywhere.empty()) {
trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
}
std::vector<const char *> trigger_patterns_c;
trigger_patterns_c.reserve(trigger_patterns.size());
for (const auto & regex : trigger_patterns) {
trigger_patterns_c.push_back(regex.c_str());
}
grmr = params.grammar_lazy
? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
trigger_words.data(), trigger_words.size(),
params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
trigger_patterns_c.data(), trigger_patterns_c.size(),
trigger_tokens.data(), trigger_tokens.size())
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
if (!grmr) {
return nullptr;
}
}
auto * result = new common_sampler {

View File

@ -5,6 +5,7 @@
#include "sampling.h"
#include <cstring>
#include <algorithm>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@ -172,7 +173,7 @@ llama_tokens common_speculative_gen_draft(
result.reserve(params.n_draft);
if (reuse_n == 0) {
llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);
prompt.clear();
} else {
@ -191,14 +192,14 @@ llama_tokens common_speculative_gen_draft(
}
if (reuse_i > 0) {
llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
}
if (reuse_n < (int) prompt.size()) {
llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
prompt.erase(prompt.begin() + reuse_n, prompt.end());
}

File diff suppressed because it is too large Load Diff

View File

@ -109,6 +109,13 @@ models = [
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
{"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
{"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
{"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
{"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
{"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
]
@ -131,6 +138,10 @@ def download_model(model):
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
if name == "gpt-4o":
# Xenova/gpt-4o is tokenizer-only, it does not contain config.json
files = ["tokenizer.json", "tokenizer_config.json"]
if tokt == TOKENIZER_TYPE.SPM:
files.append("tokenizer.model")

View File

@ -24,7 +24,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
import gguf
# reuse model definitions from convert_hf_to_gguf.py
from convert_hf_to_gguf import LazyTorchTensor, Model
from convert_hf_to_gguf import LazyTorchTensor, ModelBase
logger = logging.getLogger("lora-to-gguf")
@ -340,11 +340,11 @@ if __name__ == '__main__':
sys.exit(1)
else:
logger.info(f"Loading base model: {dir_base_model.name}")
hparams = Model.load_hparams(dir_base_model)
hparams = ModelBase.load_hparams(dir_base_model)
with torch.inference_mode():
try:
model_class = Model.from_model_architecture(hparams["architectures"][0])
model_class = ModelBase.from_model_architecture(hparams["architectures"][0])
except NotImplementedError:
logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1)

View File

@ -14,9 +14,7 @@ In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox
- [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment)
- [Installing Essential Development Tools](#installing-essential-development-tools)
- [Adding the CUDA Repository](#adding-the-cuda-repository)
- [Installing `nvidia-driver-libs`](#installing-nvidia-driver-libs)
- [Manually Resolving Package Conflicts](#manually-resolving-package-conflicts)
- [Finalizing the Installation of `nvidia-driver-libs`](#finalizing-the-installation-of-nvidia-driver-libs)
- [Installing Nvidia Driver Libraries](#installing-nvidia-driver-libraries)
- [Installing the CUDA Meta-Package](#installing-the-cuda-meta-package)
- [Configuring the Environment](#configuring-the-environment)
- [Verifying the Installation](#verifying-the-installation)
@ -67,7 +65,7 @@ This guide focuses on Fedora hosts, but with small adjustments, it can work for
sudo dnf distro-sync
```
2. **Install the Default Text Editor (Optional):**
2. **Install Vim, the default text editor (Optional):**
```bash
sudo dnf install vim-default-editor --allowerasing
@ -97,36 +95,48 @@ After adding the repository, synchronize the package manager again:
sudo dnf distro-sync
```
## Installing `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
## Installing Nvidia Driver Libraries
We need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go).
First, we need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go):
```bash
ls -la /usr/lib64/libcuda.so.1
```
### If *`libcuda.so.1`* is missing:
```
ls: cannot access '/usr/lib64/libcuda.so.1': No such file or directory
```
**Explanation:**
The host does not supply the CUDA drivers; **install them now:**
- `nvidia-driver-libs` and `nvidia-driver-cuda-libs` contains necessary NVIDIA driver libraries required by CUDA,
on hosts with NVIDIA drivers installed the Fedora Container will supply the host libraries.
### Install Nvidia Driver Libraries on Guest (if `libcuda.so.1` was NOT found).
#### Install the Nvidia Driver Libraries on Guest:
```bash
sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
```
### Manually Updating the RPM database for host-supplied NVIDIA drivers (if `libcuda.so.1` was found).
### If *`libcuda.so.1`* exists:
```
lrwxrwxrwx. 1 root root 21 Mar 24 11:26 /usr/lib64/libcuda.so.1 -> libcuda.so.570.133.07
```
If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files.
**Explanation:**
The host is supplying the CUDA drivers, so **we need to update the guest RPM database accordingly:**
#### 1. Download `nvidia-driver-libs` and `nvidia-driver-cuda-libs` RPM's (with dependencies)
#### Update the Toolbox RPM Database to include the Host-Supplied Libraries:
Note: we do not actually install the libraries; we just update the RPM database so that the guest system knows they are supplied by the host.
##### 1. Download the host-supplied `nvidia-*` RPMs (with dependencies)
```bash
sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-libs nvidia-driver-cuda-libs
sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
```
#### 2. Update the RPM database to assume the installation of these packages.
##### 2. Update the RPM database to assume the installation of these packages.
```bash
sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*
@ -134,23 +144,26 @@ sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*
**Note:**
- The `--justdb` option only updates the RPM database, without touching the filesystem.
- The `--justdb` option only updates the RPM database, without touching the filesystem elsewhere.
#### Finalizing the Installation of `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
##### Check that the RPM Database has been correctly updated:
**Note:** This is the same command as in the *"Install the Nvidia Driver Libraries on Guest"* section used when *`libcuda.so.1`* was missing.
After manually installing the dependencies, run:
```bash
sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
```
You should receive a message indicating the package is already installed:
*(this time it will not install anything, as the database thinks these packages are already installed)*
```
Updating and loading repositories:
Repositories loaded.
Package "nvidia-driver-libs-3:570.86.10-1.fc41.x86_64" is already installed.
Package "nvidia-driver-cuda-libs-3:570.86.10-1.fc41.x86_64" is already installed.
Package "nvidia-driver-cuda-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-driver-libs-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-driver-cuda-libs-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-persistenced-3:570.124.06-1.fc41.x86_64" is already installed.
Nothing to do.
```
@ -207,9 +220,9 @@ You should see output similar to:
```
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Wed_Jan_15_19:20:09_PST_2025
Cuda compilation tools, release 12.8, V12.8.61
Build cuda_12.8.r12.8/compiler.35404655_0
Built on Fri_Feb_21_20:23:50_PST_2025
Cuda compilation tools, release 12.8, V12.8.93
Build cuda_12.8.r12.8/compiler.35583870_0
```
This output confirms that the CUDA compiler is accessible and indicates the installed version.

View File

@ -145,8 +145,13 @@ A Snapdragon X Elite device with Windows 11 Arm64 is used. Make sure the followi
* Clang 19
* Ninja
* Visual Studio 2022
* Powershell 7
Powershell is used for the following instructions.
Visual Studio provides the necessary headers and libraries, although it is not directly used for building.
Alternatively, Visual Studio Build Tools can be installed instead of the full Visual Studio.
PowerShell 7 is used for the following commands.
If an older version of PowerShell is used, these commands may not work as written.
### I. Setup Environment
@ -196,10 +201,9 @@ ninja
## Known Issues
- Qwen2.5 0.5B model produces gibberish output with Adreno kernels.
- Currently OpenCL backend does not work on Adreno 6xx GPUs.
## TODO
- Fix Qwen2.5 0.5B
- Optimization for Q6_K
- Support and optimization for Q4_K

View File

@ -20,7 +20,7 @@
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
@ -227,30 +227,19 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:
```sh
git clone https://github.com/oneapi-src/oneMKL
cd oneMKL
cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
cmake --build buildWithCublas --config Release
git clone https://github.com/oneapi-src/oneDNN.git
cd oneDNN
cmake -GNinja -Bbuild-nvidia -DDNNL_CPU_RUNTIME=DPCPP -DDNNL_GPU_RUNTIME=DPCPP -DDNNL_GPU_VENDOR=NVIDIA -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build build-nvidia --config Release
```
- **Adding support to AMD GPUs**
**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs.
```sh
git clone https://github.com/oneapi-src/oneMKL
cd oneMKL
# Find your HIPTARGET with rocminfo, under the key 'Name:'
cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas
cmake --build buildWithrocBLAS --config Release
```
3. **Verify installation and environment**
In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
@ -313,37 +302,39 @@ cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -
cmake --build build --config Release -j -v
```
It is possible to come across some precision issues when running tests that stem from using faster
instructions, which can be circumvented by setting the environment variable `SYCL_PROGRAM_COMPILE_OPTIONS`
to `-cl-fp32-correctly-rounded-divide-sqrt`.
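For example (a minimal sketch, assuming a Bash-like shell):
```bash
# Sketch: work around FP32 precision issues in the tests
export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"
# ...then run the tests as usual
```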
#### Nvidia GPU
```sh
# Export relevant ENV variables
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
```sh
# Build LLAMA with Nvidia BLAS acceleration through SYCL
# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
# Option 2: Use FP16
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
# build all binary
cmake --build build --config Release -j -v
```
It is possible to come across some precision issues when running tests that stem from using faster
instructions, which can be circumvented by passing the `-fno-fast-math` flag to the compiler.
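One possible way to pass the flag (an assumption about your build setup; adapt if you already set `CMAKE_CXX_FLAGS`):
```bash
# Sketch: add -fno-fast-math to the Nvidia SYCL build to avoid precision issues in tests
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCMAKE_CXX_FLAGS="-fno-fast-math"
cmake --build build --config Release -j -v
```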
#### AMD GPU
```sh
# Export relevant ENV variables
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
```sh
# Build LLAMA with rocBLAS acceleration through SYCL
## AMD
@ -434,13 +425,13 @@ Examples:
- Use device 0:
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
```
- Use multiple devices:
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```
*Notes:*
@ -484,6 +475,12 @@ b. Enable oneAPI running environment:
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```
- if you are using Powershell, enable the runtime environment with the following:
```
cmd.exe "/K" '"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" && powershell'
```
c. Verify installation
In the oneAPI command line, run the following to print the available SYCL devices:
@ -514,13 +511,13 @@ You could download the release package for Windows directly, which including bin
Choose one of the following methods to build from source code.
1. Script
#### 1. Script
```sh
.\examples\sycl\win-build-sycl.bat
```
2. CMake
#### 2. CMake
On the oneAPI command line window, step into the llama.cpp main directory and run the following:
@ -549,13 +546,84 @@ cmake --preset x64-windows-sycl-debug
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
```
3. Visual Studio
#### 3. Visual Studio
You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
You have two options to use Visual Studio to build llama.cpp:
- As CMake Project using CMake presets.
- Creating a Visual Studio solution to handle the project.
**Note**:
All following commands are executed in PowerShell.
##### - Open as a CMake Project
You can use Visual Studio to open the `llama.cpp` folder directly as a CMake project. Before compiling, select one of the SYCL CMake presets:
- `x64-windows-sycl-release`
- `x64-windows-sycl-debug`
*Notes:*
- For a minimal experimental setup, you can build only the inference executable using:
- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
```Powershell
cmake --build build --config Release -j --target llama-cli
```
##### - Generating a Visual Studio Solution
You can use a Visual Studio solution to build and work on llama.cpp on Windows. To do so, you need to convert the CMake project into a `.sln` file.
If you want to use the Intel C++ Compiler for the entire `llama.cpp` project, run the following command:
```Powershell
cmake -B build -G "Visual Studio 17 2022" -T "Intel C++ Compiler 2025" -A x64 -DGGML_SYCL=ON -DCMAKE_BUILD_TYPE=Release
```
If you prefer to use the Intel C++ Compiler only for `ggml-sycl`, ensure that `ggml` and its backend libraries are built as shared libraries (i.e. `-DBUILD_SHARED_LIBS=ON`, the default behaviour):
```Powershell
cmake -B build -G "Visual Studio 17 2022" -A x64 -DGGML_SYCL=ON -DCMAKE_BUILD_TYPE=Release \
-DSYCL_INCLUDE_DIR="C:\Program Files (x86)\Intel\oneAPI\compiler\latest\include" \
-DSYCL_LIBRARY_DIR="C:\Program Files (x86)\Intel\oneAPI\compiler\latest\lib"
```
If successful, the build files have been written to: *path/to/llama.cpp/build*
Open the project file **build/llama.cpp.sln** with Visual Studio.
Once the Visual Studio solution is created, follow these steps:
1. Open the solution in Visual Studio.
2. Right-click on `ggml-sycl` and select **Properties**.
3. In the left column, expand **C/C++** and select **DPC++**.
4. In the right panel, find **Enable SYCL Offload** and set it to `Yes`.
5. Apply the changes and save.
*Navigation Path:*
```
Properties -> C/C++ -> DPC++ -> Enable SYCL Offload (Yes)
```
Now, you can build `llama.cpp` with the SYCL backend as a Visual Studio project.
To do it from menu: `Build -> Build Solution`.
Once it is completed, final results will be in **build/Release/bin**
*Additional Note*
- You can avoid specifying `SYCL_INCLUDE_DIR` and `SYCL_LIBRARY_DIR` in the CMake command by setting the environment variables:
- `SYCL_INCLUDE_DIR_HINT`
- `SYCL_LIBRARY_DIR_HINT`
- The above instructions have been tested with Visual Studio 17 Community edition and oneAPI 2025.0. We expect them to also work with future versions if adapted accordingly.
### III. Run the inference
@ -629,13 +697,13 @@ Examples:
- Use device 0:
```
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
```
- Use multiple devices:
```
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
```
@ -660,8 +728,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|--------------------|---------------------------------------|---------------------------------------------|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better performance than FP16 on quantized models|
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
@ -671,6 +740,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |

View File

@ -132,12 +132,14 @@ You may find the official downloads here: [NVIDIA developer site](https://develo
#### Compile and run inside a Fedora Toolbox Container
We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
**Recommended for:**
- ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
- Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
- ***Necessary*** for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
- (there are no supported CUDA packages for these systems)
- ***Necessary*** for users that have a host that is not a: [Supported Nvidia CUDA Release Platform](https://developer.nvidia.com/cuda-downloads).
- (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your host operating system)
- ***Convenient*** For those running [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde), and want to keep their host system clean.
- *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)
@ -189,7 +191,7 @@ The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
@ -197,29 +199,54 @@ The following compilation options are also available to tweak performance:
## MUSA
This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
This provides GPU acceleration using a Moore Threads GPU. Make sure to have the [MUSA SDK](https://developer.mthreads.com/musa/musa-sdk) installed.
- Using `CMake`:
#### Download directly from Moore Threads
```bash
cmake -B build -DGGML_MUSA=ON
cmake --build build --config Release
You may find the official downloads here: [Moore Threads developer site](https://developer.mthreads.com/sdk/download/musa).
### Compilation
```bash
cmake -B build -DGGML_MUSA=ON
cmake --build build --config Release
```
#### Override Compute Capability Specifications
By default, all supported compute capabilities are enabled. To customize this behavior, you can specify the `MUSA_ARCHITECTURES` option in the CMake command:
```bash
cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
cmake --build build --config Release
```
This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
#### Compilation options
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
- For static builds, add `-DBUILD_SHARED_LIBS=OFF` and `-DCMAKE_POSITION_INDEPENDENT_CODE=ON`:
```
For static build:
```bash
cmake -B build -DGGML_MUSA=ON \
-DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
cmake --build build --config Release
```
The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
### Runtime MUSA environment variables
You may set the [MUSA environment variables](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) at runtime.
```bash
# Use `MUSA_VISIBLE_DEVICES` to hide the first compute device.
MUSA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
```
### Unified Memory
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
## HIP
This provides GPU acceleration on HIP-supported AMD GPUs.
@ -232,8 +259,12 @@ You can download it from your Linux distro's package manager or from here: [ROCm
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build --config Release -- -j 16
```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
The rocWMMA library is included by default when installing the ROCm SDK using the `rocm` meta package provided by AMD. Alternatively, if you are not using the meta package, you can install the library using the `rocwmma-dev` or `rocwmma-devel` package, depending on your system's package manager.
As an alternative, you can manually install the library by cloning it from the official [GitHub repository](https://github.com/ROCm/rocWMMA), checking out the corresponding version tag (e.g. `rocm-6.2.4`), and setting `-DCMAKE_CXX_FLAGS="-I<path/to/rocwmma>/library/include/"` in CMake. This also works under Windows, although it is not officially supported by AMD.
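For example, a build sketch combining the HIP instructions above with rocWMMA flash attention (the `gfx1100` target is an assumption; substitute your own `AMDGPU_TARGETS`):
```bash
# Sketch: HIP build with rocWMMA-accelerated flash attention (adjust the gfx target to your GPU)
cmake -S . -B build -DGGML_HIP=ON -DGGML_HIP_ROCWMMA_FATTN=ON -DAMDGPU_TARGETS=gfx1100 -DCMAKE_BUILD_TYPE=Release \
    && cmake --build build --config Release -- -j 16
```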
Note that if you get the following error:
```
@ -263,6 +294,10 @@ You can download it from your Linux distro's package manager or from here: [ROCm
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officially supported, you can set the environment variable `HSA_OVERRIDE_GFX_VERSION` to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
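For example (a sketch; the model path is a placeholder and `10.3.0` assumes an RDNA2-class GPU):
```bash
# Sketch: run on a not-officially-supported RDNA2 GPU by overriding the GFX version
HSA_OVERRIDE_GFX_VERSION=10.3.0 ./build/bin/llama-cli -m /path/to/model.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -ngl 33
```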
### Unified Memory
On Linux it is possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
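For example (a sketch with a placeholder model path, assuming an integrated GPU):
```bash
# Sketch: enable unified memory so the integrated GPU can spill into system RAM
GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 ./build/bin/llama-cli -m /path/to/model.gguf -p "Hello" -ngl 33
```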
## Vulkan
**Windows**
@ -403,6 +438,116 @@ llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
## Arm® KleidiAI™
KleidiAI is a library of optimized microkernels for AI workloads, specifically designed for Arm CPUs. These microkernels enhance performance and can be enabled for use by the CPU backend.
To enable KleidiAI, go to the llama.cpp directory and build using CMake
```bash
cmake -B build -DGGML_CPU_KLEIDIAI=ON
cmake --build build --config Release
```
You can verify that KleidiAI is being used by running
```bash
./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
```
If KleidiAI is enabled, the output will contain a line similar to:
```
load_tensors: CPU_KLEIDIAI model buffer size = 3474.00 MiB
```
KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.
Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. -DGGML_METAL=OFF, or during run-time using the command line option `--device none`.
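For example (a sketch; `PATH_TO_MODEL` is the same placeholder as above, `GGML_KLEIDIAI_SME=1` assumes an SME-capable CPU, and `--device none` is only needed when a higher-priority backend was built in):
```bash
# Sketch: force the CPU backend and enable the SME microkernels
GGML_KLEIDIAI_SME=1 ./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?" --device none
```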
## OpenCL
This provides GPU acceleration through OpenCL on recent Adreno GPUs.
More information about the OpenCL backend can be found in [OPENCL.md](./backend/OPENCL.md).
### Android
Assume NDK is available in `$ANDROID_NDK`. First, install OpenCL headers and ICD loader library if not available,
```sh
mkdir -p ~/dev/llm
cd ~/dev/llm
git clone https://github.com/KhronosGroup/OpenCL-Headers && \
cd OpenCL-Headers && \
cp -r CL $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
cd ~/dev/llm
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && \
cd OpenCL-ICD-Loader && \
mkdir build_ndk && cd build_ndk && \
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DOPENCL_ICD_LOADER_HEADERS_DIR=$ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=24 \
-DANDROID_STL=c++_shared && \
ninja && \
cp libOpenCL.so $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
```
Then build llama.cpp with OpenCL enabled,
```sh
cd ~/dev/llm
git clone https://github.com/ggml-org/llama.cpp && \
cd llama.cpp && \
mkdir build-android && cd build-android
cmake .. -G Ninja \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DBUILD_SHARED_LIBS=OFF \
-DGGML_OPENCL=ON
ninja
```
### Windows Arm64
First, install OpenCL headers and ICD loader library if not available,
```powershell
mkdir -p ~/dev/llm
cd ~/dev/llm
git clone https://github.com/KhronosGroup/OpenCL-Headers && cd OpenCL-Headers
mkdir build && cd build
cmake .. -G Ninja `
-DBUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
cmake --build . --target install
cd ~/dev/llm
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && cd OpenCL-ICD-Loader
mkdir build && cd build
cmake .. -G Ninja `
-DCMAKE_BUILD_TYPE=Release `
-DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
-DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
cmake --build . --target install
```
Then build llama.cpp with OpenCL enabled,
```powershell
cmake .. -G Ninja `
-DCMAKE_TOOLCHAIN_FILE="$HOME/dev/llm/llama.cpp/cmake/arm64-windows-llvm.cmake" `
-DCMAKE_BUILD_TYPE=Release `
-DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
-DBUILD_SHARED_LIBS=OFF `
-DGGML_OPENCL=ON
ninja
```
## Android
For documentation on how to build on Android, [click here](./android.md)

View File

@ -287,30 +287,32 @@ Here are some models known to work (w/ chat template override when needed):
llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L
llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M
# Native support for DeepSeek R1 works best w/ our own template (official template buggy)
# Native support for DeepSeek R1 works best w/ our template override (official template is buggy, although we do work around it)
llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
# Native support requires the right template for these GGUFs:
llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M \
--chat-template-file models/templates/meetkai-functionary-medium-v3.2.jinja
llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \
--chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use )
--chat-template-file models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja
llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \
--chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use )
--chat-template-file models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja
llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \
--chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use )
--chat-template-file models/templates/fireworks-ai-llama-3-firefunction-v2.jinja
llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \
--chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use )
--chat-template-file models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja
# Generic format support
llama-server --jinja -fa -hf bartowski/phi-4-GGUF:Q4_0
@ -318,6 +320,8 @@ llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q8_0
llama-server --jinja -fa -hf bartowski/c4ai-command-r-v01-GGUF:Q2_K
```
To get the official template from the original HuggingFace repos, you can use [scripts/get_chat_template.py](../scripts/get_chat_template.py) (see example invocations in [models/templates/README.md](../models/templates/README.md))
> [!TIP]
> If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills)
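As a sketch, the fallback from the tip above could look like this (the model choice here is just illustrative, taken from the generic-format examples):
```bash
# no official tool_use template: fall back to the generic ChatML template
llama-server --jinja -fa -hf bartowski/phi-4-GGUF:Q4_0 --chat-template chatml
```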

View File

@ -9,6 +9,13 @@ brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
## MacPorts
```sh
sudo port install llama.cpp
```
see also: https://ports.macports.org/port/llama.cpp/details/
## Nix
On Mac and Linux, the Nix package manager can be used via

View File

@ -9,15 +9,15 @@ The implementation is based on llava, and is compatible with llava and mobileVLM
Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
## Usage
Build with cmake or run `make llama-llava-cli` to build it.
After building, run: `./llama-llava-cli` to see the usage. For example:
Build the `llama-mtmd-cli` binary.
After building, run: `./llama-mtmd-cli` to see the usage. For example:
```sh
./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
./llama-mtmd-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
--mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
--image path/to/an/image.jpg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
--chat-template deepseek
```
## Model conversion
@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path`
### case 1
**input**
```sh
/data/local/tmp/llama-llava-cli \
/data/local/tmp/llama-mtmd-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
@ -102,7 +102,7 @@ llama_print_timings: total time = 34731.93 ms
### case 2
**input**
```sh
/data/local/tmp/llama-llava-cli \
/data/local/tmp/llama-mtmd-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
@ -123,10 +123,10 @@ llama_print_timings: total time = 34570.79 ms
## Some result on Android with `Snapdragon 778G` chip
### MobileVLM-1.7B case
#### llava-cli release-b2005
#### mtmd-cli release-b2005
**input**
```sh
/data/local/tmp/llama-llava-cli \
/data/local/tmp/llama-mtmd-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
@ -147,7 +147,7 @@ llama_print_timings: prompt eval time = 8119.49 ms / 191 tokens ( 42.51 m
llama_print_timings: eval time = 1005.75 ms / 14 runs ( 71.84 ms per token, 13.92 tokens per second)
llama_print_timings: total time = 28038.34 ms / 205 tokens
```
#### llava-cli latest-version
#### mtmd-cli latest-version
**input**
Just the same as above.
@ -169,7 +169,7 @@ llama_print_timings: eval time = 43894.02 ms / 13 runs ( 3376.46 m
llama_print_timings: total time = 865441.76 ms / 204 tokens
```
### MobileVLM_V2-1.7B case
#### llava-cli release-2005b
#### mtmd-cli release-2005b
**input**
Just the same as above.
@ -200,7 +200,7 @@ make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32
### case 1
**input**
```sh
./llama-llava-cli \
./llama-mtmd-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
--image /data/local/tmp/demo.jpeg \
@ -224,7 +224,7 @@ llama_print_timings: total time = 1352.63 ms / 252 tokens
### case 2
**input**
```sh
./llama-llava-cli \
./llama-mtmd-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \

51
docs/multimodal/gemma3.md Normal file
View File

@ -0,0 +1,51 @@
# Gemma 3 vision
> [!IMPORTANT]
>
> This is very experimental and only intended for demo purposes.
## Quick start
You can use a pre-quantized model from [ggml-org](https://huggingface.co/ggml-org)'s Hugging Face account
```bash
# build
cmake -B build
cmake --build build --target llama-mtmd-cli
# alternatively, install from brew (MacOS)
brew install llama.cpp
# run it
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
llama-mtmd-cli -hf ggml-org/gemma-3-12b-it-GGUF
llama-mtmd-cli -hf ggml-org/gemma-3-27b-it-GGUF
# note: 1B model does not support vision
```
## How to get mmproj.gguf?
Simply add `--mmproj` when converting the model via `convert_hf_to_gguf.py`:
```bash
cd gemma-3-4b-it
python ../llama.cpp/convert_hf_to_gguf.py --outfile model.gguf --outtype f16 --mmproj .
# output file: mmproj-model.gguf
```
## How to run it?
What you need:
- The text model GGUF, which can be converted using `convert_hf_to_gguf.py`
- The mmproj file from the step above
- An image file
```bash
# build
cmake -B build
cmake --build build --target llama-mtmd-cli
# run it
./build/bin/llama-mtmd-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg
```

View File

@ -3,12 +3,12 @@
Currently this implementation supports [glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b) and [glm-edge-v-5b](https://huggingface.co/THUDM/glm-edge-v-5b).
## Usage
Build with cmake or run `make llama-llava-cli` to build it.
Build the `llama-mtmd-cli` binary.
After building, run: `./llama-llava-cli` to see the usage. For example:
After building, run: `./llama-mtmd-cli` to see the usage. For example:
```sh
./llama-llava-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf --image img_path/image.jpg -p "<|system|>\n system prompt <image><|user|>\n prompt <|assistant|>\n"
./llama-mtmd-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf
```
**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.

View File

@ -3,8 +3,8 @@
Download the model and point your `GRANITE_MODEL` environment variable to the path.
```bash
$ git clone https://huggingface.co/ibm-granite/granite-vision-3.1-2b-preview
$ export GRANITE_MODEL=./granite-vision-3.1-2b-preview
$ git clone https://huggingface.co/ibm-granite/granite-vision-3.2-2b
$ export GRANITE_MODEL=./granite-vision-3.2-2b
```
@ -41,10 +41,18 @@ If you actually inspect the `.keys()` of the loaded tensors, you should see a lo
### 2. Creating the Visual Component GGUF
To create the GGUF for the visual components, we need to write a config for the visual encoder; make sure the config contains the correct `image_grid_pinpoints`
Next, create a new directory to hold the visual components, and copy the llava.clip/projector files, as shown below.
```bash
$ ENCODER_PATH=$PWD/visual_encoder
$ mkdir $ENCODER_PATH
$ cp $GRANITE_MODEL/llava.clip $ENCODER_PATH/pytorch_model.bin
$ cp $GRANITE_MODEL/llava.projector $ENCODER_PATH/
```
Now, we need to write a config for the visual encoder. In order to convert the model, be sure to use the correct `image_grid_pinpoints`, as these may vary based on the model. You can find the `image_grid_pinpoints` in `$GRANITE_MODEL/config.json`.
Note: we refer to this file as `$VISION_CONFIG` later on.
```json
{
"_name_or_path": "siglip-model",
@ -52,6 +60,7 @@ Note: we refer to this file as `$VISION_CONFIG` later on.
"SiglipVisionModel"
],
"image_grid_pinpoints": [
[384,384],
[384,768],
[384,1152],
[384,1536],
@ -94,24 +103,13 @@ Note: we refer to this file as `$VISION_CONFIG` later on.
}
```
Create a new directory to hold the visual components, and copy the llava.clip/projector files, as well as the vision config into it.
```bash
$ ENCODER_PATH=$PWD/visual_encoder
$ mkdir $ENCODER_PATH
$ cp $GRANITE_MODEL/llava.clip $ENCODER_PATH/pytorch_model.bin
$ cp $GRANITE_MODEL/llava.projector $ENCODER_PATH/
$ cp $VISION_CONFIG $ENCODER_PATH/config.json
```
At which point you should have something like this:
At this point you should have something like this:
```bash
$ ls $ENCODER_PATH
config.json llava.projector pytorch_model.bin
```
Now convert the components to GGUF; Note that we also override the image mean/std dev to `[.5,.5,.5]` since we use the siglip visual encoder - in the transformers model, you can find these numbers in the [preprocessor_config.json](https://huggingface.co/ibm-granite/granite-vision-3.1-2b-preview/blob/main/preprocessor_config.json).
Now convert the components to GGUF; Note that we also override the image mean/std dev to `[.5,.5,.5]` since we use the SigLIP visual encoder - in the transformers model, you can find these numbers in the `preprocessor_config.json`.
```bash
$ python convert_image_encoder_to_gguf.py \
-m $ENCODER_PATH \
@ -119,17 +117,18 @@ $ python convert_image_encoder_to_gguf.py \
--output-dir $ENCODER_PATH \
--clip-model-is-vision \
--clip-model-is-siglip \
--image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
--image-mean 0.5 0.5 0.5 \
--image-std 0.5 0.5 0.5
```
this will create the first GGUF file at `$ENCODER_PATH/mmproj-model-f16.gguf`; we will refer to the abs path of this file as the `$VISUAL_GGUF_PATH.`
This will create the first GGUF file at `$ENCODER_PATH/mmproj-model-f16.gguf`; we will refer to the absolute path of this file as `$VISUAL_GGUF_PATH`.
### 3. Creating the LLM GGUF.
The granite vision model contains a granite LLM as its language model. For now, the easiest way to get the GGUF for the LLM is by loading the composite model in `transformers` and exporting the LLM so that it can be directly converted with the normal conversion path.
First, set the `LLM_EXPORT_PATH` to the path to export the `transformers` LLM to.
```
```bash
$ export LLM_EXPORT_PATH=$PWD/granite_vision_llm
```
@ -142,7 +141,7 @@ if not MODEL_PATH:
raise ValueError("env var GRANITE_MODEL is unset!")
LLM_EXPORT_PATH = os.getenv("LLM_EXPORT_PATH")
if not MODEL_PATH:
if not LLM_EXPORT_PATH:
raise ValueError("env var LLM_EXPORT_PATH is unset!")
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
@ -166,18 +165,22 @@ $ python convert_hf_to_gguf.py --outfile $LLM_GGUF_PATH $LLM_EXPORT_PATH
```
### 4. Running the Model in Llama cpp
Build llama cpp normally; you should have a target binary named `llama-llava-cli`, which you can pass two binaries to. Sample usage:
Note - the test image shown below can be found [here](https://github-production-user-asset-6210df.s3.amazonaws.com/10740300/415512792-d90d5562-8844-4f34-a0a5-77f62d5a58b5.jpg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20250221%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250221T054145Z&X-Amz-Expires=300&X-Amz-Signature=86c60be490aa49ef7d53f25d6c973580a8273904fed11ed2453d0a38240ee40a&X-Amz-SignedHeaders=host).
### 4. Quantization
If you want to quantize the LLM, you can do so with `llama-quantize` as you would any other LLM. For example:
```bash
$ ./build/bin/llama-llava-cli -m $LLM_GGUF_PATH \
--mmproj $VISUAL_GGUF_PATH \
--image cherry_blossom.jpg \
-c 16384 \
-p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\<image>\nWhat type of flowers are in this picture?\n<|assistant|>\n" \
--temp 0
$ ./build/bin/llama-quantize $LLM_EXPORT_PATH/granite_llm.gguf $LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf Q4_K_M
$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf
```
Sample response: `The flowers in the picture are cherry blossoms, which are known for their delicate pink petals and are often associated with the beauty of spring.`
Note that currently you cannot quantize the visual encoder because granite vision models use SigLIP as the visual encoder, which has tensor dimensions that are not divisible by 32.
### 5. Running the Model in Llama cpp
Build llama.cpp normally; you should have a target binary named `llama-mtmd-cli`, to which you can pass the two GGUF files. As an example, we pass the llama.cpp banner.
```bash
$ ./build/bin/llama-mtmd-cli -m $LLM_GGUF_PATH \
--mmproj $VISUAL_GGUF_PATH \
-c 16384 \
--temp 0
```

143
docs/multimodal/llava.md Normal file
View File

@ -0,0 +1,143 @@
# LLaVA
Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants,
as well as [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants.
The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
models are available.
For llava-1.6, a variety of prepared gguf models are available as well: [7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf).
After the API is confirmed, more models will be supported / uploaded.
## Usage
Build the `llama-mtmd-cli` binary.
After building, run: `./llama-mtmd-cli` to see the usage. For example:
```sh
./llama-mtmd-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf \
--mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf \
--chat-template vicuna
```
**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
**note**: For GPU offloading, make sure to use the `-ngl` flag just like usual.
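As a combined sketch of the two notes above (same placeholder paths as in the usage example; `-ngl 99` is only an illustrative value meaning "offload all layers"):
```sh
# lower temperature for quality, offload layers to the GPU as usual
./llama-mtmd-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf \
    --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf \
    --chat-template vicuna \
    --temp 0.1 -ngl 99
```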
## LLaVA 1.5
1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
```sh
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
```
2. Install the required Python packages:
```sh
pip install -r examples/llava/requirements.txt
```
3. Use `llava_surgery.py` to split the LLaVA model into its LLaMA and multimodal projector constituents:
```sh
python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
```
4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
```sh
python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
```
5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown
```
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
## LLaVA 1.6 gguf conversion
1) First clone a LLaVA 1.6 model:
```console
git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
```
2) Install the required Python packages:
```sh
pip install -r examples/llava/requirements.txt
```
3) Use `llava_surgery_v2.py`, which also supports llava-1.5 variants, in both pytorch and safetensor formats:
```console
python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
```
- you will find a llava.projector and a llava.clip file in your model directory
4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
```console
mkdir vit
cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
cp ../llava-v1.6-vicuna-7b/llava.projector vit/
curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
```
5) Create the visual gguf model:
```console
python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
```
- This is similar to llava-1.5; the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
6) Then convert the model to gguf format:
```console
python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
```
7) And finally we can run the llava cli using the 1.6 model version:
```console
./llama-mtmd-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf
```
**note** llava-1.6 needs more context than llava-1.5; at least 3000 is needed (just run it at -c 4096)
**note** llava-1.6 greatly benefits from batched prompt processing (defaults work)
**note** if the language model in step `6)` is incompatible with the legacy conversion script, the easiest way to handle the LLM conversion is to load the model in transformers and export only the LLM from the llava-next model.
```python
import os
import transformers
# set these placeholders to the llava-next model directory and the desired export directory
model_path = ...
llm_export_path = ...
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
model = transformers.AutoModelForImageTextToText.from_pretrained(model_path)
tokenizer.save_pretrained(llm_export_path)
model.language_model.save_pretrained(llm_export_path)
```
Then, you can convert the LLM using the `convert_hf_to_gguf.py` script, which handles more LLM architectures.
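A minimal sketch of that conversion step, assuming the export directory from the snippet above (the path is a placeholder):
```sh
# convert the exported LLM to GGUF; replace the path with your llm_export_path
python ./convert_hf_to_gguf.py path/to/llm_export_path
```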
## Chat template
For llava-1.5 and llava-1.6, you need to use the `vicuna` chat template. Simply add `--chat-template vicuna` to activate this template.
## How to know if you are running in llava-1.5 or llava-1.6 mode
When running llava-cli you will see visual information right before the prompt is processed:
**Llava-1.5:**
`encode_image_with_clip: image embedding created: 576 tokens`
**Llava-1.6 (anything above 576):**
`encode_image_with_clip: image embedding created: 2880 tokens`
Alternatively, just pay attention to how many "tokens" have been used for your prompt; it will also show 1000+ tokens for llava-1.6.

View File

@ -5,13 +5,25 @@ Currently, this readme only supports minicpm-omni's image capabilities, and we w
Download the [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from Hugging Face into the "MiniCPM-o-2_6" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone git@github.com:OpenBMB/llama.cpp.git
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
git checkout minicpm-omni
```
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-o 2.6
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) files provided by us)
@ -22,25 +34,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
# quantize int4 version
./llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
./build/bin/llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build llama.cpp using `CMake`:
https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md
```bash
cmake -B build
cmake --build build --config Release
```
Inference on Linux or Mac
```
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf
```

View File

@ -0,0 +1,47 @@
## MiniCPM-Llama3-V 2.5
### Prepare models and code
Download the [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from Hugging Face into the "MiniCPM-Llama3-V-2_5" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-Llama3-V 2.5
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) files provided by us)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Inference on Linux or Mac
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf
```

View File

@ -0,0 +1,47 @@
## MiniCPM-V 2.6
### Prepare models and code
Download the [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from Hugging Face into the "MiniCPM-V-2_6" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-V 2.6
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) files provided by us)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Inference on Linux or Mac
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf
```

View File

@ -21,11 +21,6 @@ else()
add_subdirectory(embedding)
add_subdirectory(eval-callback)
if (NOT WIN32)
# disabled on Windows because it uses internal functions not exported with LLAMA_API
add_subdirectory(gbnf-validator)
endif()
add_subdirectory(gguf-hash)
add_subdirectory(gguf-split)
add_subdirectory(gguf)
@ -58,10 +53,6 @@ else()
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(cvector-generator)
add_subdirectory(export-lora)
if (NOT WIN32)
# disabled on Windows because it uses internal functions not exported with LLAMA_API
add_subdirectory(quantize-stats)
endif()
add_subdirectory(llava)
if (GGML_RPC)
add_subdirectory(rpc)

View File

@ -38,7 +38,7 @@ int main(int argc, char ** argv) {
llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@ -132,7 +132,7 @@ int main(int argc, char ** argv) {
const auto t_pp_start = ggml_time_us();
llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
@ -141,7 +141,7 @@ int main(int argc, char ** argv) {
if (is_pp_shared) {
for (int32_t i = 1; i < pl; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
}
}

View File

@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 {
}
for i in 1 ..< n_parallel {
llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
}
if n_parallel > 1 {

View File

@ -41,7 +41,7 @@ int main(int argc, char ** argv) {
llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: error: unable to load model\n" , __func__);

View File

@ -342,7 +342,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
}
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
@ -394,6 +394,8 @@ static int prepare_entries(common_params & params, train_context & ctx_train) {
int main(int argc, char ** argv) {
common_params params;
params.out_file = "control_vector.gguf";
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
return 1;
}
@ -498,7 +500,7 @@ int main(int argc, char ** argv) {
}
// write output vectors to gguf
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
export_gguf(ctx_train.v_final, params.out_file, model_hint);
llama_backend_free();

View File

@ -4,6 +4,7 @@
#include "llama.h"
#include <ctime>
#include <algorithm>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@ -37,7 +38,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
const struct llama_model * model = llama_get_model(ctx);
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);
// run model
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
@ -88,6 +89,13 @@ int main(int argc, char ** argv) {
common_init();
params.embedding = true;
// utilize the full context
if (params.n_batch < params.n_ctx) {
LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
params.n_batch = params.n_ctx;
}
// For non-causal models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
@ -133,7 +141,6 @@ int main(int argc, char ** argv) {
// max batch size
const uint64_t n_batch = params.n_batch;
GGML_ASSERT(params.n_batch >= params.n_ctx);
// tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs;

View File

@ -413,20 +413,22 @@ static void print_usage(int, char ** argv) {
int main(int argc, char ** argv) {
common_params params;
params.out_file = "ggml-lora-merged-f16.gguf";
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
return 1;
}
g_verbose = (params.verbosity > 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());
exit(EXIT_FAILURE);
}
printf("done, output file is %s\n", params.lora_outfile.c_str());
printf("done, output file is %s\n", params.out_file.c_str());
return 0;
}

View File

@ -1,5 +0,0 @@
set(TARGET llama-gbnf-validator)
add_executable(${TARGET} gbnf-validator.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -408,8 +408,6 @@ static void gguf_merge(const split_params & split_params) {
exit(EXIT_FAILURE);
}
std::ofstream fout(split_params.output.c_str(), std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
auto * ctx_out = gguf_init_empty();
@ -453,7 +451,6 @@ static void gguf_merge(const split_params & split_params) {
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}
@ -466,7 +463,6 @@ static void gguf_merge(const split_params & split_params) {
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}
@ -479,7 +475,6 @@ static void gguf_merge(const split_params & split_params) {
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}
@ -500,9 +495,11 @@ static void gguf_merge(const split_params & split_params) {
fprintf(stderr, "\033[3Ddone\n");
}
// placeholder for the meta data
{
std::ofstream fout;
if (!split_params.dry_run) {
fout.open(split_params.output.c_str(), std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
// placeholder for the meta data
auto meta_size = gguf_get_meta_size(ctx_out);
::zeros(fout, meta_size);
}
@ -518,7 +515,9 @@ static void gguf_merge(const split_params & split_params) {
ggml_free(ctx_metas[i]);
}
gguf_free(ctx_out);
fout.close();
if (!split_params.dry_run) {
fout.close();
}
exit(EXIT_FAILURE);
}
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
@ -540,10 +539,11 @@ static void gguf_merge(const split_params & split_params) {
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
f_input.seekg(offset);
f_input.read((char *)read_data.data(), n_bytes);
// write tensor data + padding
fout.write((const char *)read_data.data(), n_bytes);
zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
if (!split_params.dry_run) {
// write tensor data + padding
fout.write((const char *)read_data.data(), n_bytes);
zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
}
}
gguf_free(ctx_gguf);
@ -552,16 +552,15 @@ static void gguf_merge(const split_params & split_params) {
fprintf(stderr, "\033[3Ddone\n");
}
{
if (!split_params.dry_run) {
// go back to beginning of file and write the updated metadata
fout.seekp(0);
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
gguf_get_meta_data(ctx_out, data.data());
fout.write((const char *)data.data(), data.size());
fout.close();
gguf_free(ctx_out);
}
gguf_free(ctx_out);
fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n",
__func__, split_params.output.c_str(), n_split, total_tensors);

View File

@ -45,7 +45,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
}
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);
llama_set_embeddings(ctx, true);
llama_set_causal_attn(ctx, false);
@ -102,7 +102,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
llama_token eos_token = llama_vocab_eos(vocab);
llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);
llama_set_embeddings(ctx, false);
llama_set_causal_attn(ctx, true);
@ -168,7 +168,7 @@ int main(int argc, char * argv[]) {
llama_backend_init();
llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
// create generation context
llama_context * ctx = llama_init_from_model(model, cparams);

View File

@ -206,9 +206,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
void IMatrixCollector::save_imatrix(int ncall) const {
auto fname = m_params.out_file;
if (fname.empty()) {
fname = "imatrix.dat";
}
if (ncall > 0) {
fname += ".at_";
@ -498,7 +495,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);
llama_batch batch = llama_batch_init(n_batch, 0, 1);
@ -583,6 +580,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
int main(int argc, char ** argv) {
common_params params;
params.out_file = "imatrix.dat" ;
params.n_ctx = 512;
params.logits_all = true;
params.escape = false;

View File

@ -332,8 +332,8 @@ int main(int argc, char ** argv) {
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
n_past -= n_discard;

View File

@ -10,6 +10,9 @@ from typing import Any, List, Optional, Set, Tuple, Union
def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
if max_items == 0:
return ""
if min_items == 0 and max_items == 1:
return f'{item_rule}?'
@ -195,7 +198,7 @@ class BuiltinRule:
self.deps = deps or []
# Constraining spaces to prevent model "running away".
SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}'
PRIMITIVE_RULES = {
'boolean' : BuiltinRule('("true" | "false") space', []),

View File

@ -28,6 +28,7 @@ options:
-p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128)
-pg <pp,tg> (default: )
-d, --n-depth <n> (default: 0)
-b, --batch-size <n> (default: 2048)
-ub, --ubatch-size <n> (default: 512)
-ctk, --cache-type-k <t> (default: f16)
@ -66,6 +67,8 @@ With the exception of `-r`, `-o` and `-v`, all options can be specified multiple
Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.
Using the `-d <n>` option, each test can be run at a specified context depth, prefilling the KV cache with `<n>` tokens.
For a description of the other options, see the [main example](../main/README.md).
Note:
@ -148,6 +151,19 @@ $ ./llama-bench -ngl 10,20,30,31,32,33,34,35
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 |
### Different prefilled context
```
$ ./llama-bench -d 0,512
```
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ± 23.45 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ± 0.59 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ± 18.88 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ± 0.60 |
## Output formats
By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
@ -170,9 +186,9 @@ $ ./llama-bench -o csv
```
```csv
build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
```
### JSON
@ -184,64 +200,78 @@ $ ./llama-bench -o json
```json
[
{
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
"metal": false,
"gpu_blas": true,
"blas": true,
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
"model_filename": "models/7B/ggml-model-q4_0.gguf",
"model_type": "llama 7B mostly Q4_0",
"model_size": 3825065984,
"model_n_params": 6738415616,
"n_batch": 512,
"n_threads": 16,
"f16_kv": true,
"build_commit": "8cf427ff",
"build_number": 5163,
"cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
"gpu_info": "NVIDIA GeForce RTX 4080",
"backends": "CUDA",
"model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
"model_type": "qwen2 7B Q4_K - Medium",
"model_size": 4677120000,
"model_n_params": 7615616512,
"n_batch": 2048,
"n_ubatch": 512,
"n_threads": 8,
"cpu_mask": "0x0",
"cpu_strict": false,
"poll": 50,
"type_k": "f16",
"type_v": "f16",
"n_gpu_layers": 99,
"split_mode": "layer",
"main_gpu": 0,
"mul_mat_q": true,
"no_kv_offload": false,
"flash_attn": false,
"tensor_split": "0.00",
"use_mmap": true,
"embeddings": false,
"n_prompt": 512,
"n_gen": 0,
"test_time": "2023-09-23T12:09:57Z",
"avg_ns": 212365953,
"stddev_ns": 985423,
"avg_ts": 2410.974041,
"stddev_ts": 11.163766,
"samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
"samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
"n_depth": 0,
"test_time": "2025-04-24T11:58:50Z",
"avg_ns": 72135640,
"stddev_ns": 1453752,
"avg_ts": 7100.002165,
"stddev_ts": 140.341520,
"samples_ns": [ 74601900, 71632900, 71745200, 71952700, 70745500 ],
"samples_ts": [ 6863.1, 7147.55, 7136.37, 7115.79, 7237.21 ]
},
{
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
"metal": false,
"gpu_blas": true,
"blas": true,
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
"model_filename": "models/7B/ggml-model-q4_0.gguf",
"model_type": "llama 7B mostly Q4_0",
"model_size": 3825065984,
"model_n_params": 6738415616,
"n_batch": 512,
"n_threads": 16,
"f16_kv": true,
"build_commit": "8cf427ff",
"build_number": 5163,
"cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
"gpu_info": "NVIDIA GeForce RTX 4080",
"backends": "CUDA",
"model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
"model_type": "qwen2 7B Q4_K - Medium",
"model_size": 4677120000,
"model_n_params": 7615616512,
"n_batch": 2048,
"n_ubatch": 512,
"n_threads": 8,
"cpu_mask": "0x0",
"cpu_strict": false,
"poll": 50,
"type_k": "f16",
"type_v": "f16",
"n_gpu_layers": 99,
"split_mode": "layer",
"main_gpu": 0,
"mul_mat_q": true,
"no_kv_offload": false,
"flash_attn": false,
"tensor_split": "0.00",
"use_mmap": true,
"embeddings": false,
"n_prompt": 0,
"n_gen": 128,
"test_time": "2023-09-23T12:09:59Z",
"avg_ns": 977425219,
"stddev_ns": 9268593,
"avg_ts": 130.965708,
"stddev_ts": 1.238924,
"samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
"samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
"n_depth": 0,
"test_time": "2025-04-24T11:58:51Z",
"avg_ns": 1076767880,
"stddev_ns": 9449585,
"avg_ts": 118.881588,
"stddev_ts": 1.041811,
"samples_ns": [ 1075361300, 1065089400, 1071761200, 1081934900, 1089692600 ],
"samples_ts": [ 119.03, 120.178, 119.43, 118.307, 117.464 ]
}
]
```
@ -254,8 +284,8 @@ $ ./llama-bench -o jsonl
```
```json lines
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]}
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]}
```
@ -271,25 +301,32 @@ $ ./llama-bench -o sql
CREATE TABLE IF NOT EXISTS test (
build_commit TEXT,
build_number INTEGER,
cuda INTEGER,
metal INTEGER,
gpu_blas INTEGER,
blas INTEGER,
cpu_info TEXT,
gpu_info TEXT,
backends TEXT,
model_filename TEXT,
model_type TEXT,
model_size INTEGER,
model_n_params INTEGER,
n_batch INTEGER,
n_ubatch INTEGER,
n_threads INTEGER,
f16_kv INTEGER,
cpu_mask TEXT,
cpu_strict INTEGER,
poll INTEGER,
type_k TEXT,
type_v TEXT,
n_gpu_layers INTEGER,
split_mode TEXT,
main_gpu INTEGER,
mul_mat_q INTEGER,
no_kv_offload INTEGER,
flash_attn INTEGER,
tensor_split TEXT,
use_mmap INTEGER,
embeddings INTEGER,
n_prompt INTEGER,
n_gen INTEGER,
n_depth INTEGER,
test_time TEXT,
avg_ns INTEGER,
stddev_ns INTEGER,
@ -297,6 +334,6 @@ CREATE TABLE IF NOT EXISTS test (
stddev_ts REAL
);
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613');
INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647');
```

View File

@ -36,6 +36,46 @@ static uint64_t get_time_ns() {
return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
}
static bool tensor_buft_override_equal(const llama_model_tensor_buft_override& a, const llama_model_tensor_buft_override& b) {
if (a.pattern != b.pattern) {
// cString comparison that may be null
if (a.pattern == nullptr || b.pattern == nullptr) {
return false;
}
if (strcmp(a.pattern, b.pattern) != 0) {
return false;
}
}
if (a.buft != b.buft) {
return false;
}
return true;
}
static bool vec_tensor_buft_override_equal(const std::vector<llama_model_tensor_buft_override>& a, const std::vector<llama_model_tensor_buft_override>& b) {
if (a.size() != b.size()) {
return false;
}
for (size_t i = 0; i < a.size(); i++) {
if (!tensor_buft_override_equal(a[i], b[i])) {
return false;
}
}
return true;
}
static bool vec_vec_tensor_buft_override_equal(const std::vector<std::vector<llama_model_tensor_buft_override>>& a, const std::vector<std::vector<llama_model_tensor_buft_override>>& b) {
if (a.size() != b.size()) {
return false;
}
for (size_t i = 0; i < a.size(); i++) {
if (!vec_tensor_buft_override_equal(a[i], b[i])) {
return false;
}
}
return true;
}
template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
std::ostringstream str;
for (size_t i = 0; i < values.size(); i++) {
@ -160,6 +200,7 @@ struct cmd_params {
std::vector<int> n_prompt;
std::vector<int> n_gen;
std::vector<std::pair<int, int>> n_pg;
std::vector<int> n_depth;
std::vector<int> n_batch;
std::vector<int> n_ubatch;
std::vector<ggml_type> type_k;
@ -175,6 +216,7 @@ struct cmd_params {
std::vector<bool> no_kv_offload;
std::vector<bool> flash_attn;
std::vector<std::vector<float>> tensor_split;
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
std::vector<bool> use_mmap;
std::vector<bool> embeddings;
ggml_numa_strategy numa;
@ -192,6 +234,7 @@ static const cmd_params cmd_params_defaults = {
/* n_prompt */ { 512 },
/* n_gen */ { 128 },
/* n_pg */ {},
/* n_depth */ { 0 },
/* n_batch */ { 2048 },
/* n_ubatch */ { 512 },
/* type_k */ { GGML_TYPE_F16 },
@ -207,6 +250,7 @@ static const cmd_params cmd_params_defaults = {
/* no_kv_offload */ { false },
/* flash_attn */ { false },
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{{nullptr,nullptr}} },
/* use_mmap */ { true },
/* embeddings */ { false },
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
@ -230,6 +274,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -pg <pp,tg> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
printf(" -d, --n-depth <n> (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n",
join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ub, --ubatch-size <n> (default: %s)\n",
@ -265,6 +310,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -embd, --embeddings <0|1> (default: %s)\n",
join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;... (default: disabled)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
@ -366,6 +412,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break;
}
params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
} else if (arg == "-d" || arg == "--n-depth") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<int>(argv[i], split_delim);
params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
} else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) {
invalid_param = true;
@ -557,6 +610,87 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
params.tensor_split.push_back(tensor_split);
}
} else if (arg == "-ot" || arg == "--override-tensor") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto value = argv[i];
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
if (buft_list.empty()) {
// enumerate all the devices and add their buffer types to the list
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
auto * dev = ggml_backend_dev_get(i);
auto * buft = ggml_backend_dev_buffer_type(dev);
if (buft) {
buft_list[ggml_backend_buft_name(buft)] = buft;
}
}
}
auto override_group_span_len = std::strcspn(value, ",");
bool last_group = false;
do {
if (override_group_span_len == 0) {
// Adds an empty override-tensors for an empty span
params.tensor_buft_overrides.push_back({{}});
if (value[override_group_span_len] == '\0') {
value = &value[override_group_span_len];
last_group = true;
} else {
value = &value[override_group_span_len + 1];
override_group_span_len = std::strcspn(value, ",");
}
continue;
}
// Stamps null terminators into the argv
// value for this option to avoid the
// memory leak present in the implementation
// over in arg.cpp. Acceptable because we
// only parse these args once in this program.
auto override_group = value;
if (value[override_group_span_len] == '\0') {
value = &value[override_group_span_len];
last_group = true;
} else {
value[override_group_span_len] = '\0';
value = &value[override_group_span_len + 1];
}
std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
auto override_span_len = std::strcspn(override_group, ";");
while (override_span_len > 0) {
auto override = override_group;
if (override_group[override_span_len] != '\0') {
override_group[override_span_len] = '\0';
override_group = &override_group[override_span_len + 1];
} else {
override_group = &override_group[override_span_len];
}
auto tensor_name_span_len = std::strcspn(override, "=");
if (tensor_name_span_len >= override_span_len) {
invalid_param = true;
break;
}
override[tensor_name_span_len] = '\0';
auto tensor_name = override;
auto buffer_type = &override[tensor_name_span_len + 1];
if (buft_list.find(buffer_type) == buft_list.end()) {
printf("Available buffer types:\n");
for (const auto & it : buft_list) {
printf(" %s\n", ggml_backend_buft_name(it.second));
}
invalid_param = true;
break;
}
group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)});
override_span_len = std::strcspn(override_group, ";");
}
if (invalid_param) {
break;
}
group_tensor_buft_overrides.push_back({nullptr,nullptr});
params.tensor_buft_overrides.push_back(group_tensor_buft_overrides);
override_group_span_len = std::strcspn(value, ",");
} while (!last_group);
} else if (arg == "-r" || arg == "--repetitions") {
if (++i >= argc) {
invalid_param = true;
@ -615,6 +749,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.n_pg.empty()) {
params.n_pg = cmd_params_defaults.n_pg;
}
if (params.n_depth.empty()) {
params.n_depth = cmd_params_defaults.n_depth;
}
if (params.n_batch.empty()) {
params.n_batch = cmd_params_defaults.n_batch;
}
@ -648,6 +785,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.tensor_split.empty()) {
params.tensor_split = cmd_params_defaults.tensor_split;
}
if (params.tensor_buft_overrides.empty()) {
params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides;
}
if (params.use_mmap.empty()) {
params.use_mmap = cmd_params_defaults.use_mmap;
}
@ -674,6 +814,7 @@ struct cmd_params_instance {
std::string model;
int n_prompt;
int n_gen;
int n_depth;
int n_batch;
int n_ubatch;
ggml_type type_k;
@ -689,6 +830,7 @@ struct cmd_params_instance {
bool no_kv_offload;
bool flash_attn;
std::vector<float> tensor_split;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
bool use_mmap;
bool embeddings;
@ -733,19 +875,26 @@ struct cmd_params_instance {
mparams.tensor_split = tensor_split.data();
mparams.use_mmap = use_mmap;
if (tensor_buft_overrides.empty()) {
mparams.tensor_buft_overrides = nullptr;
} else {
GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
mparams.tensor_buft_overrides = tensor_buft_overrides.data();
}
return mparams;
}
bool equal_mparams(const cmd_params_instance & other) const {
return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
tensor_split == other.tensor_split;
tensor_split == other.tensor_split && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
}
llama_context_params to_llama_cparams() const {
llama_context_params cparams = llama_context_default_params();
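// the context must be large enough for the depth prefill in addition to the prompt and generated tokens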
cparams.n_ctx = n_prompt + n_gen;
cparams.n_ctx = n_prompt + n_gen + n_depth;
cparams.n_batch = n_batch;
cparams.n_ubatch = n_ubatch;
cparams.type_k = type_k;
@ -769,6 +918,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & sm : params.split_mode)
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
for (const auto & ot : params.tensor_buft_overrides)
for (const auto & mmp : params.use_mmap)
for (const auto & embd : params.embeddings)
for (const auto & nb : params.n_batch)
@ -780,6 +930,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & nt : params.n_threads)
for (const auto & cm : params.cpu_mask)
for (const auto & cs : params.cpu_strict)
for (const auto & nd : params.n_depth)
for (const auto & pl : params.poll) {
for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) {
@ -789,6 +940,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .model = */ m,
/* .n_prompt = */ n_prompt,
/* .n_gen = */ 0,
/* .n_depth = */ nd,
/* .n_batch = */ nb,
/* .n_ubatch = */ nub,
/* .type_k = */ tk,
@ -804,6 +956,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .tensor_split = */ ts,
/* .tensor_buft_overrides = */ ot,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
};
@ -818,6 +971,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .model = */ m,
/* .n_prompt = */ 0,
/* .n_gen = */ n_gen,
/* .n_depth = */ nd,
/* .n_batch = */ nb,
/* .n_ubatch = */ nub,
/* .type_k = */ tk,
@ -833,6 +987,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .tensor_split = */ ts,
/* .tensor_buft_overrides = */ ot,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
};
@ -847,6 +1002,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .model = */ m,
/* .n_prompt = */ n_pg.first,
/* .n_gen = */ n_pg.second,
/* .n_depth = */ nd,
/* .n_batch = */ nb,
/* .n_ubatch = */ nub,
/* .type_k = */ tk,
@ -862,6 +1018,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .tensor_split = */ ts,
/* .tensor_buft_overrides = */ ot,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
};
@ -896,10 +1053,12 @@ struct test {
bool no_kv_offload;
bool flash_attn;
std::vector<float> tensor_split;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
bool use_mmap;
bool embeddings;
int n_prompt;
int n_gen;
int n_depth;
std::string test_time;
std::vector<uint64_t> samples_ns;
@ -927,10 +1086,12 @@ struct test {
no_kv_offload = inst.no_kv_offload;
flash_attn = inst.flash_attn;
tensor_split = inst.tensor_split;
tensor_buft_overrides = inst.tensor_buft_overrides;
use_mmap = inst.use_mmap;
embeddings = inst.embeddings;
n_prompt = inst.n_prompt;
n_gen = inst.n_gen;
n_depth = inst.n_depth;
// RFC 3339 date-time format
time_t t = time(NULL);
std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
@ -972,9 +1133,9 @@ struct test {
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap",
"embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns",
"avg_ts", "stddev_ts",
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
"use_mmap", "embeddings", "n_prompt", "n_gen", "n_depth", "test_time",
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
};
return fields;
}
@ -984,8 +1145,8 @@ struct test {
static field_type get_field_type(const std::string & field) {
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" ||
field == "stddev_ns") {
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
field == "avg_ns" || field == "stddev_ns") {
return INT;
}
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
@ -1000,6 +1161,7 @@ struct test {
std::vector<std::string> get_values() const {
std::string tensor_split_str;
std::string tensor_buft_overrides_str;
int max_nonzero = 0;
for (size_t i = 0; i < llama_max_devices(); i++) {
if (tensor_split[i] > 0) {
@ -1014,6 +1176,26 @@ struct test {
tensor_split_str += "/";
}
}
if (tensor_buft_overrides.size() == 1) {
// Last element of tensor_buft_overrides is always a null pattern
// so if it is only one element long, it must be a null pattern.
GGML_ASSERT(tensor_buft_overrides[0].pattern == nullptr);
tensor_buft_overrides_str += "none";
} else {
for (size_t i = 0; i < tensor_buft_overrides.size()-1; i++) {
// Last element of tensor_buft_overrides is always a null pattern
if (tensor_buft_overrides[i].pattern == nullptr) {
tensor_buft_overrides_str += "none";
} else {
tensor_buft_overrides_str += tensor_buft_overrides[i].pattern;
tensor_buft_overrides_str += "=";
tensor_buft_overrides_str += ggml_backend_buft_name(tensor_buft_overrides[i].buft);
}
if (i + 2 < tensor_buft_overrides.size()) {
tensor_buft_overrides_str += ";";
}
}
}
std::vector<std::string> values = { build_commit,
std::to_string(build_number),
cpu_info,
@ -1037,10 +1219,12 @@ struct test {
std::to_string(no_kv_offload),
std::to_string(flash_attn),
tensor_split_str,
tensor_buft_overrides_str,
std::to_string(use_mmap),
std::to_string(embeddings),
std::to_string(n_prompt),
std::to_string(n_gen),
std::to_string(n_depth),
test_time,
std::to_string(avg_ns()),
std::to_string(stdev_ns()),
@ -1218,7 +1402,7 @@ struct markdown_printer : public printer {
return 4;
}
if (field == "test") {
return 13;
return 15;
}
int width = std::max((int) field.length(), 10);
@ -1254,6 +1438,9 @@ struct markdown_printer : public printer {
if (field == "tensor_split") {
return "ts";
}
if (field == "tensor_buft_overrides") {
return "ot";
}
return field;
}
@ -1307,6 +1494,9 @@ struct markdown_printer : public printer {
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
fields.emplace_back("tensor_split");
}
if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) {
fields.emplace_back("tensor_buft_overrides");
}
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
fields.emplace_back("use_mmap");
}
@ -1362,6 +1552,10 @@ struct markdown_printer : public printer {
} else {
snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
}
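// report the evaluation depth as part of the test name, e.g. "pp512 @ d1024"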
if (t.n_depth > 0) {
int len = strlen(buf);
snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth);
}
value = buf;
} else if (field == "t/s") {
snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
@ -1578,7 +1772,7 @@ int main(int argc, char ** argv) {
test t(inst, lmodel, ctx);
llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);
// cool off before the test
if (params.delay) {
@ -1618,7 +1812,15 @@ int main(int argc, char ** argv) {
}
for (int i = 0; i < params.reps; i++) {
llama_kv_cache_clear(ctx);
llama_kv_self_clear(ctx);
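// when a depth is requested, first fill the KV cache with n_depth tokens so that
// the timed run below measures performance at that context depth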
if (t.n_depth > 0) {
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
i + 1, params.reps);
}
test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
}
uint64_t t_start = get_time_ns();

View File

@ -18,6 +18,7 @@ android {
}
externalNativeBuild {
cmake {
arguments += "-DLLAMA_CURL=OFF"
arguments += "-DLLAMA_BUILD_COMMON=ON"
arguments += "-DGGML_LLAMAFILE=OFF"
arguments += "-DCMAKE_BUILD_TYPE=Release"

View File

@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
}
batch->logits[batch->n_tokens - 1] = true;
llama_kv_cache_clear(context);
llama_kv_self_clear(context);
const auto t_pp_start = ggml_time_us();
if (llama_decode(context, *batch) != 0) {
@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
LOGi("Benchmark text generation (tg)");
llama_kv_cache_clear(context);
llama_kv_self_clear(context);
const auto t_tg_start = ggml_time_us();
for (i = 0; i < tg; i++) {
@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
const auto t_tg_end = ggml_time_us();
llama_kv_cache_clear(context);
llama_kv_self_clear(context);
const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
@ -361,7 +361,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
const auto tokens_list = common_tokenize(context, text, true, parse_special);
auto n_ctx = llama_n_ctx(context);
auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
auto n_kv_req = tokens_list.size() + n_len;
LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req);
@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
}

View File

@ -5,6 +5,21 @@ point for more advanced projects.
For usage instructions and performance stats, check the following discussion: https://github.com/ggml-org/llama.cpp/discussions/4508
### Building
First, llama.cpp needs to be built and an XCFramework needs to be created. This can be done by running
the following script from the llama.cpp project root:
```console
$ ./build-xcframework.sh
```
Open the `llama.swiftui.xcodeproj` project in Xcode and you should be able to build and run the app on
a simulator or a real device.
To use the framework with a different project, add `build-apple/llama.xcframework` to that project by
dragging and dropping it into the project navigator, or by manually selecting the framework in the
"Frameworks, Libraries, and Embedded Content" section of the project settings.
![image](https://github.com/ggml-org/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299)
Video demonstration:

View File

@ -210,7 +210,7 @@ actor LlamaContext {
}
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
llama_kv_cache_clear(context)
llama_kv_self_clear(context)
let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
@ -223,7 +223,7 @@ actor LlamaContext {
// bench text generation
llama_kv_cache_clear(context)
llama_kv_self_clear(context)
let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
@ -242,7 +242,7 @@ actor LlamaContext {
let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
llama_kv_cache_clear(context)
llama_kv_self_clear(context)
let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
@ -292,7 +292,7 @@ actor LlamaContext {
func clear() {
tokens_list.removeAll()
temporary_invalid_cchars.removeAll()
llama_kv_cache_clear(context)
llama_kv_self_clear(context)
}
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {

View File

@ -7,7 +7,6 @@
objects = {
/* Begin PBXBuildFile section */
1809696D2D05A39F00400EE8 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = 1809696C2D05A39F00400EE8 /* llama */; };
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; };
7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
@ -18,9 +17,25 @@
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
DD84C9FD2D747FED007778EC /* llama.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = DD84C9FC2D747FED007778EC /* llama.xcframework */; };
DD84C9FE2D747FED007778EC /* llama.xcframework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = DD84C9FC2D747FED007778EC /* llama.xcframework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
/* End PBXBuildFile section */
/* Begin PBXCopyFilesBuildPhase section */
DD84C9FF2D747FED007778EC /* Embed Frameworks */ = {
isa = PBXCopyFilesBuildPhase;
buildActionMask = 2147483647;
dstPath = "";
dstSubfolderSpec = 10;
files = (
DD84C9FE2D747FED007778EC /* llama.xcframework in Embed Frameworks */,
);
name = "Embed Frameworks";
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputButton.swift; sourceTree = "<group>"; };
@ -33,6 +48,7 @@
8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
DD84C9FC2D747FED007778EC /* llama.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = llama.xcframework; path = "../../build-apple/llama.xcframework"; sourceTree = "<group>"; };
DF2D2FE72B4A59BE00FCB72D /* llama.cpp */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = llama.cpp; path = ../..; sourceTree = "<group>"; };
F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LoadCustomButton.swift; sourceTree = "<group>"; };
/* End PBXFileReference section */
@ -42,9 +58,9 @@
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
1809696D2D05A39F00400EE8 /* llama in Frameworks */,
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
DD84C9FD2D747FED007778EC /* llama.xcframework in Frameworks */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@ -86,6 +102,7 @@
8A39BE082AC7601000BFEB40 /* Frameworks */ = {
isa = PBXGroup;
children = (
DD84C9FC2D747FED007778EC /* llama.xcframework */,
549479CA2AC9E16000E0F78B /* Metal.framework */,
8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
);
@ -144,6 +161,7 @@
8A1C836F2AC328BD0096AF73 /* Sources */,
8A1C83702AC328BD0096AF73 /* Frameworks */,
8A1C83712AC328BD0096AF73 /* Resources */,
DD84C9FF2D747FED007778EC /* Embed Frameworks */,
);
buildRules = (
);
@ -151,7 +169,6 @@
);
name = llama.swiftui;
packageProductDependencies = (
1809696C2D05A39F00400EE8 /* llama */,
);
productName = llama.swiftui;
productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
@ -427,13 +444,6 @@
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */
/* Begin XCSwiftPackageProductDependency section */
1809696C2D05A39F00400EE8 /* llama */ = {
isa = XCSwiftPackageProductDependency;
productName = llama;
};
/* End XCSwiftPackageProductDependency section */
};
rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
}

View File

@ -1,3 +1,5 @@
# llava (legacy)
add_library(llava OBJECT
llava.cpp
llava.h
@ -22,33 +24,53 @@ if (BUILD_SHARED_LIBS)
install(TARGETS llava_shared LIBRARY)
endif()
# mtmd
add_library(mtmd OBJECT
mtmd.cpp
mtmd.h
clip.cpp
clip.h
clip-impl.h
)
target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(mtmd PUBLIC .)
target_include_directories(mtmd PRIVATE ../..)
target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h
target_compile_features(mtmd PRIVATE cxx_std_17)
add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
if (BUILD_SHARED_LIBS)
set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS mtmd_shared LIBRARY)
endif()
if (NOT MSVC)
target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
endif()
if(TARGET BUILD_INFO)
add_dependencies(llava BUILD_INFO)
add_dependencies(mtmd BUILD_INFO)
endif()
set(TARGET llama-llava-cli)
add_executable(${TARGET} llava-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
add_executable(llama-llava-cli deprecation-warning.cpp)
add_executable(llama-gemma3-cli deprecation-warning.cpp)
add_executable(llama-minicpmv-cli deprecation-warning.cpp)
add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
set(TARGET llama-minicpmv-cli)
add_executable(${TARGET} minicpmv-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
set(TARGET llama-mtmd-cli)
add_executable(${TARGET} mtmd-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-qwen2vl-cli)
add_executable(${TARGET} qwen2vl-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-llava-clip-quantize-cli)

View File

@ -1,99 +0,0 @@
## MiniCPM-Llama3-V 2.5
### Prepare models and code
Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder.
Clone llama.cpp:
```bash
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```
### Usage
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
# quantize int4 version
./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build for Linux or Mac
```bash
make
make llama-minicpmv-cli
```
Inference on Linux or Mac
```
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```
### Android
#### Build on Android device using Termux
We found that build on Android device would bring better runtime performance, so we recommend to build on device.
[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
Install tools in Termux:
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
#### Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```bash
mkdir build-android
cd build-android
export NDK=/your_ndk_path
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```
Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
```
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
```

View File

@ -1,107 +0,0 @@
## MiniCPM-V 2.6
### Prepare models and code
Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder.
Clone llama.cpp:
```bash
git clone git@github.com:OpenBMB/llama.cpp.git
cd llama.cpp
git checkout minicpmv-main
```
### Usage of MiniCPM-V 2.6
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
# quantize int4 version
./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build for Linux or Mac
```bash
make
make llama-minicpmv-cli
```
Inference on Linux or Mac
```
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```
### Video
Install FFmpeg
```
brew install ffmpeg
brew install pkg-config
```
### Android
#### Build on Android device using Termux
We found that build on Android device would bring better runtime performance, so we recommend to build on device.
[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
Install tools in Termux:
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
#### Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```bash
mkdir build-android
cd build-android
export NDK=/your_ndk_path
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```
Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
```
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
```

View File

@ -1,158 +1,79 @@
# LLaVA
# Multimodal Support in llama.cpp
Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants,
as well as llava-1.6 [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants.
This directory provides multimodal capabilities for `llama.cpp`. Initially intended as a showcase for running LLaVA models, its scope has expanded significantly over time to include various other vision-capable models. As a result, LLaVA is no longer the only multimodal architecture supported.
The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
models are available.
For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf)
> [!IMPORTANT]
>
> Multimodal support can be viewed as a sub-project within `llama.cpp`. It is under **very heavy development**, and **breaking changes are expected**.
After API is confirmed, more models will be supported / uploaded.
The naming and structure related to multimodal support have evolved, which might cause some confusion. Here's a brief timeline to clarify:
## Usage
Build with cmake or run `make llama-llava-cli` to build it.
- [#3436](https://github.com/ggml-org/llama.cpp/pull/3436): Initial support for LLaVA 1.5 was added, introducing `llava.cpp` and `clip.cpp`. The `llava-cli` binary was created for model interaction.
- [#4954](https://github.com/ggml-org/llama.cpp/pull/4954): Support for MobileVLM was added, becoming the second vision model supported. This built upon the existing `llava.cpp`, `clip.cpp`, and `llava-cli` infrastructure.
- **Expansion & Fragmentation:** Many new models were subsequently added (e.g., [#7599](https://github.com/ggml-org/llama.cpp/pull/7599), [#10361](https://github.com/ggml-org/llama.cpp/pull/10361), [#12344](https://github.com/ggml-org/llama.cpp/pull/12344), and others). However, `llava-cli` lacked support for the increasingly complex chat templates required by these models. This led to the creation of model-specific binaries like `qwen2vl-cli`, `minicpmv-cli`, and `gemma3-cli`. While functional, this proliferation of command-line tools became confusing for users.
- [#12849](https://github.com/ggml-org/llama.cpp/pull/12849): `libmtmd` was introduced as a replacement for `llava.cpp`. Its goals include providing a single, unified command-line interface, improving the user/developer experience (UX/DX), and supporting both audio and image inputs.
- [#13012](https://github.com/ggml-org/llama.cpp/pull/13012): `mtmd-cli` was added, consolidating the various model-specific CLIs into a single tool powered by `libmtmd`.
After building, run: `./llama-llava-cli` to see the usage. For example:
## Pre-quantized models
These are ready-to-use models; most of them come with `Q4_K_M` quantization by default:
```sh
./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
# Gemma 3
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
llama-mtmd-cli -hf ggml-org/gemma-3-12b-it-GGUF
llama-mtmd-cli -hf ggml-org/gemma-3-27b-it-GGUF
# SmolVLM
llama-mtmd-cli -hf ggml-org/SmolVLM-Instruct-GGUF
llama-mtmd-cli -hf ggml-org/SmolVLM-256M-Instruct-GGUF
llama-mtmd-cli -hf ggml-org/SmolVLM-500M-Instruct-GGUF
llama-mtmd-cli -hf ggml-org/SmolVLM2-2.2B-Instruct-GGUF
llama-mtmd-cli -hf ggml-org/SmolVLM2-256M-Video-Instruct-GGUF
llama-mtmd-cli -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
# Pixtral 12B
llama-mtmd-cli -hf ggml-org/pixtral-12b-GGUF
# Mistral Small 3.1 24B (IQ2_M quantization)
llama-mtmd-cli -hf ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF --chat-template mistral-v7
```
**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
**note**: For GPU offloading, make sure to use the `-ngl` flag as usual
## How it works and what is `mmproj`?
## LLaVA 1.5
Multimodal support in `llama.cpp` works by encoding images into embeddings using a separate model component, and then feeding these embeddings into the language model.
1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
This approach keeps the multimodal components distinct from the core `libllama` library. Separating these allows for faster, independent development cycles. While many modern vision models are based on Vision Transformers (ViTs), their specific pre-processing and projection steps can vary significantly. Integrating this diverse complexity directly into `libllama` is currently challenging.
```sh
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
Consequently, running a multimodal model typically requires two GGUF files:
1. The standard language model file.
2. A corresponding **multimodal projector (`mmproj`)** file, which handles the image encoding and projection.
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
```
## What is `libmtmd`?
2. Install the required Python packages:
As outlined in the history, `libmtmd` is the modern library designed to replace the original `llava.cpp` implementation for handling multimodal inputs.
```sh
pip install -r examples/llava/requirements.txt
```
Built upon `clip.cpp` (similar to `llava.cpp`), `libmtmd` offers several advantages:
- **Unified Interface:** Aims to consolidate interaction for various multimodal models.
- **Improved UX/DX:** Features a more intuitive API, inspired by the `Processor` class in the Hugging Face `transformers` library.
- **Flexibility:** Designed to support multiple input types (text, audio, images) while respecting the wide variety of chat templates used by different models.
3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
## How to obtain `mmproj`
```sh
python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
```
Multimodal projector (`mmproj`) files are specific to each model architecture. Please refer to the relevant guide for instructions on how to obtain or create them:
4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
- [LLaVA](../../docs/multimodal/llava.md)
- [MobileVLM](../../docs/multimodal/MobileVLM.md)
- [GLM-Edge](../../docs/multimodal/glmedge.md)
- [MiniCPM-V 2.5](../../docs/multimodal/minicpmv2.5.md)
- [MiniCPM-V 2.6](../../docs/multimodal/minicpmv2.6.md)
- [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md)
- [IBM Granite Vision](../../docs/multimodal/granitevision.md)
- [Google Gemma 3](../../docs/multimodal/gemma3.md)
```sh
python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
```
5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown
```
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
## LLaVA 1.6 gguf conversion
1) First clone a LLaVA 1.6 model:
```console
git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
```
2) Install the required Python packages:
```sh
pip install -r examples/llava/requirements.txt
```
3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
```console
python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
```
- you will find a llava.projector and a llava.clip file in your model directory
4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
```console
mkdir vit
cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
cp ../llava-v1.6-vicuna-7b/llava.projector vit/
curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
```
5) Create the visual gguf model:
```console
python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
```
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
6) Then convert the model to gguf format:
```console
python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
```
7) And finally we can run the llava cli using the 1.6 model version:
```console
./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
```
**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
**note** llava-1.6 greatly benefits from batched prompt processing (defaults work)
**note** if the language model in step `6)` is incompatible with the legacy conversion script, the easiest way handle the LLM model conversion is to load the model in transformers, and export only the LLM from the llava next model.
```python
import os
import transformers
model_path = ...
llm_export_path = ...
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
model = transformers.AutoModelForImageTextToText.from_pretrained(model_path)
tokenizer.save_pretrained(llm_export_path)
model.language_model.save_pretrained(llm_export_path)
```
Then, you can convert the LLM using the `convert_hf_to_gguf.py` script, which handles more LLM architectures.
## llava-cli templating and llava-1.6 prompting
llava-1.5 models all use the same vicuna prompt, here you can just add your image question like `-p "Provide a full description."`
For llava-1.5 models which are not vicuna (mistral and Yi) you need to adapt system prompt as well as user prompt, for this purpose llava-cli has a basic templating system:
**For Mistral and using llava-cli binary:**
Add this: `-p "<image>\nUSER:\nProvide a full description.\nASSISTANT:\n"`
The mistral template for llava-1.6 seems to be no system print and a USER/ASSISTANT role
**For the 34B this should work:**
Add this: `-e -p <|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nProvide a full description.<|im_end|><|im_start|>assistant\n`
## How to know if you are running in llava-1.5 or llava-1.6 mode
When running llava-cli you will see a visual information right before the prompt is being processed:
**Llava-1.5:**
`encode_image_with_clip: image embedding created: 576 tokens`
**Llava-1.6 (anything above 576):**
`encode_image_with_clip: image embedding created: 2880 tokens`
Alternatively just pay notice to how many "tokens" have been used for your prompt, it will also show 1000+ tokens for llava-1.6
## TODO
- [x] Support non-CPU backend for the image encoding part.
- [ ] Support different sampling methods.
- [ ] Support more model variants.
For the following models, you can use `convert_hf_to_gguf.py` with the `--mmproj` flag to get the `mmproj` file (see the example sketched after this list):
- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) - Note: 1B variant does not have vision support
- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with `transformers`-compatible checkpoint
- [Mistral Small 3.1 24B](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)
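As a rough sketch of the end-to-end flow (the paths and output file names below are illustrative, and it is assumed that `llama-mtmd-cli` accepts the same `-m`, `--mmproj`, `--image` and `-p` flags as the CLIs it replaces):
```sh
# export the language model GGUF and then the mmproj GGUF from a local checkout
# (paths and output file names are illustrative)
python ./convert_hf_to_gguf.py ../SmolVLM-Instruct
python ./convert_hf_to_gguf.py ../SmolVLM-Instruct --mmproj

# run the unified CLI with both files
./llama-mtmd-cli -m ../SmolVLM-Instruct/model-f16.gguf \
    --mmproj ../SmolVLM-Instruct/mmproj-model-f16.gguf \
    --image path/to/an/image.jpg -p "What is in the image?"
```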

View File

@ -10,7 +10,7 @@ prompt="A chat between a curious user and an artificial intelligence assistant.
# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
program_dir="build_64/bin"
binName="llama-llava-cli"
binName="llama-mtmd-cli"
n_threads=4

examples/llava/clip-impl.h (new file, 348 lines)
View File

@ -0,0 +1,348 @@
#include "ggml.h"
#include "gguf.h"
#include "clip.h"
#include <climits>
#include <cstdarg>
#include <string>
#include <map>
#include <sstream>
#include <vector>
#include <memory>
// Internal header for clip.cpp
#define KEY_FTYPE "general.file_type"
#define KEY_NAME "general.name"
#define KEY_DESCRIPTION "general.description"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_USE_GELU "clip.use_gelu"
#define KEY_USE_SILU "clip.use_silu"
#define KEY_N_EMBD "clip.vision.embedding_length"
#define KEY_N_FF "clip.vision.feed_forward_length"
#define KEY_N_BLOCK "clip.vision.block_count"
#define KEY_N_HEAD "clip.vision.attention.head_count"
#define KEY_LAYER_NORM_EPS "clip.vision.attention.layer_norm_epsilon"
#define KEY_PROJ_DIM "clip.vision.projection_dim"
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
#define KEY_USE_GLU_MLP "clip.use_glu_mlp" // for qwen2.5vl
#define KEY_USE_RMS_NORM "clip.use_rms_norm" // for qwen2.5vl
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
//
// tensor name constants
//
#define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight" // do not rename the tensor with a ".0" postfix, for backward compat
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
#define TN_LN_1 "%s.blk.%d.ln1.%s"
#define TN_LN_2 "%s.blk.%d.ln2.%s"
#define TN_LN_PRE "%s.pre_ln.%s"
#define TN_LN_POST "%s.post_ln.%s"
#define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE "model.image_newline"
#define TN_MM_INP_NORM "mm.input_norm.weight"
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
#define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
// minicpmv
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
#define TN_MINICPMV_QUERY "resampler.query"
#define TN_MINICPMV_PROJ "resampler.proj.weight"
#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
#define TN_MINICPMV_LN "resampler.ln_%s.%s"
#define TN_GLM_ADAPER_CONV "adapter.conv.%s"
#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
enum projector_type {
PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_MLP_NORM,
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_LDPV2,
PROJECTOR_TYPE_MINICPMV,
PROJECTOR_TYPE_GLM_EDGE,
PROJECTOR_TYPE_QWEN2VL,
PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_IDEFICS3,
PROJECTOR_TYPE_PIXTRAL,
PROJECTOR_TYPE_QWEN25VL,
PROJECTOR_TYPE_UNKNOWN,
};
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_MLP, "mlp" },
{ PROJECTOR_TYPE_LDP, "ldp" },
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
{ PROJECTOR_TYPE_MINICPMV, "resampler"},
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
{ PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"},
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {
for (const auto & pair : PROJECTOR_TYPE_NAMES) {
if (pair.second == str) {
return pair.first;
}
}
return PROJECTOR_TYPE_UNKNOWN;
}
// RGB uint8 image
struct clip_image_u8 {
int nx;
int ny;
std::vector<uint8_t> buf;
};
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
int nx;
int ny;
std::vector<float> buf;
};
//
// logging
//
static void clip_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
fputs(text, stderr);
fflush(stderr);
}
struct clip_logger_state {
ggml_log_level verbosity_thold;
ggml_log_callback log_callback;
void * log_callback_user_data;
};
extern struct clip_logger_state g_logger_state;
static void clip_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
if (format == NULL) {
return;
}
va_list args_copy;
va_copy(args_copy, args);
char buffer[128];
int len = vsnprintf(buffer, 128, format, args);
if (len < 128) {
g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
} else {
char * buffer2 = (char *) calloc(len + 1, sizeof(char));
vsnprintf(buffer2, len + 1, format, args_copy);
buffer2[len] = 0;
g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
free(buffer2);
}
va_end(args_copy);
}
static void clip_log_internal(enum ggml_log_level level, const char * format, ...) {
va_list args;
va_start(args, format);
clip_log_internal_v(level, format, args);
va_end(args);
}
#define LOG_TMPL(level, ...) \
do { \
if ((level) >= g_logger_state.verbosity_thold) { \
clip_log_internal((level), __VA_ARGS__); \
} \
} while (0)
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, __VA_ARGS__)
//
// cpp wrappers
//
// wrapper for clip_image_size
struct clip_image_size_deleter {
void operator()(clip_image_size * val) { clip_image_size_free(val); }
};
typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
// wrapper for clip_image_u8
struct clip_image_u8_deleter {
void operator()(clip_image_u8 * val) { clip_image_u8_free(val); }
};
typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr;
// wrapper for clip_image_f32
struct clip_image_f32_deleter {
void operator()(clip_image_f32 * val) { clip_image_f32_free(val); }
};
typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;
struct clip_image_u8_batch {
std::vector<clip_image_u8_ptr> entries;
};
struct clip_image_f32_batch {
std::vector<clip_image_f32_ptr> entries;
};
//
// common utils
//
static std::string string_format(const char * fmt, ...) {
va_list ap;
va_list ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), buf.size());
}
static void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) {
return;
}
std::string builder;
builder.reserve(s.length());
size_t pos = 0;
size_t last_pos = 0;
while ((pos = s.find(search, last_pos)) != std::string::npos) {
builder.append(s, last_pos, pos - last_pos);
builder.append(replace);
last_pos = pos + search.length();
}
builder.append(s, last_pos, std::string::npos);
s = std::move(builder);
}
// split string by a `std::string delim` instead of `char delim`
static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
std::vector<std::string> tokens;
size_t pos = 0;
std::string token;
while ((pos = s.find(delimiter)) != std::string::npos) {
token = s.substr(0, pos);
tokens.push_back(token);
s.erase(0, pos + delimiter.length());
}
tokens.push_back(s);
return tokens;
}
//
// gguf utils
//
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
switch (type) {
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
default: return string_format("unknown type %d", type);
}
}
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
switch (type) {
case GGUF_TYPE_STRING:
return gguf_get_val_str(ctx_gguf, i);
case GGUF_TYPE_ARRAY:
{
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
int arr_n = gguf_get_arr_n(ctx_gguf, i);
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
std::stringstream ss;
ss << "[";
for (int j = 0; j < arr_n; j++) {
if (arr_type == GGUF_TYPE_STRING) {
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
// escape quotes
string_replace_all(val, "\\", "\\\\");
string_replace_all(val, "\"", "\\\"");
ss << '"' << val << '"';
} else if (arr_type == GGUF_TYPE_ARRAY) {
ss << "???";
} else {
ss << gguf_data_to_str(arr_type, data, j);
}
if (j < arr_n - 1) {
ss << ", ";
}
}
ss << "]";
return ss.str();
}
default:
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
}
}
//
// API used internally with mtmd
//
projector_type clip_get_projector_type(const struct clip_ctx * ctx);

File diff suppressed because it is too large

View File

@ -1,6 +1,7 @@
#ifndef CLIP_H
#define CLIP_H
#include "ggml.h"
#include <stddef.h>
#include <stdint.h>
@ -29,27 +30,28 @@ struct clip_image_size {
int height;
};
struct clip_image_u8_batch {
struct clip_image_u8 * data;
size_t size;
struct clip_image_f32;
struct clip_image_u8_batch;
struct clip_image_f32_batch;
struct clip_context_params {
bool use_gpu;
enum ggml_log_level verbosity;
};
struct clip_image_f32_batch {
struct clip_image_f32 * data;
size_t size;
};
// deprecated, use clip_init
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
// TODO: should be enum, not string
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
@ -57,23 +59,45 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
"use clip_n_output_tokens instead");
GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
"use clip_n_output_tokens instead");
CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
// for M-RoPE, this will be the number of token positions in X and Y directions
// for other models, X will be the total number of tokens and Y will be 1
CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
// this should be equal to the embedding dimension of the text model
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();
CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();
CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava
// nx, ny are the output image dimensions
CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
// use these to access the underlying data of clip_image_f32_batch
CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
CLIP_API struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
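A minimal sketch of iterating a batch through these accessors now that the batch type is opaque (ctx_clip and a raw clip_image_u8 * img are assumed to come from the caller, as in llava.cpp further below):

    clip_image_f32_batch * batch = clip_image_f32_batch_init();
    if (clip_image_preprocess(ctx_clip, img, batch)) {
        const size_t n = clip_image_f32_batch_n_images(batch);
        for (size_t i = 0; i < n; i++) {
            clip_image_f32 * f32 = clip_image_f32_get_img(batch, i);
            size_t nx = clip_image_f32_batch_nx(batch, i);
            size_t ny = clip_image_f32_batch_ny(batch, i);
            // ... encode f32 (nx x ny) with clip_image_encode(...) ...
        }
    }
    clip_image_f32_batch_free(batch);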
/**
* Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
* The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
@ -98,8 +122,8 @@ CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

View File

@ -89,6 +89,7 @@ def bytes_to_unicode():
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument('--bigendian', action="store_true", default=False, help="Model is executed on big-endian machine")
ap.add_argument("--text-only", action="store_true", required=False,
help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False,
@ -191,7 +192,7 @@ output_dir = args.output_dir if args.output_dir is not None else dir_model
os.makedirs(output_dir, exist_ok=True)
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
fout = GGUFWriter(path=fname_out, arch="clip")
fout = GGUFWriter(path=fname_out, arch="clip", endianess=GGUFEndian.LITTLE if not args.bigendian else GGUFEndian.BIG)
fout.add_bool("clip.has_text_encoder", has_text_encoder)
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)

View File

@ -0,0 +1,22 @@
#include <cstdio>
#include <cstdlib> // for EXIT_FAILURE
#include <string>
int main(int argc, char** argv) {
std::string filename = "main";
if (argc >= 1) {
filename = argv[0];
}
// Get only the program name from the full path
size_t pos = filename.find_last_of("/\\");
if (pos != std::string::npos) {
filename = filename.substr(pos+1);
}
fprintf(stdout, "\n");
fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
fprintf(stdout, "Please use 'llama-mtmd-cli' instead.\n");
fprintf(stdout, "\n");
return EXIT_FAILURE;
}

View File

@ -1,332 +0,0 @@
#include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include "ggml.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
}
return true;
}
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past);
}
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
return true;
}
static const char * sample(struct common_sampler * smpl,
struct llama_context * ctx_llama,
int * n_past) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true);
const llama_model * model = llama_get_model(ctx_llama);
const llama_vocab * vocab = llama_model_get_vocab(model);
static std::string ret;
if (llama_vocab_is_eog(vocab, id)) {
ret = "</s>";
} else {
ret = common_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past);
return ret.c_str();
}
static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
static const char* IMG_BASE64_TAG_END = "\">";
static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
}
static bool prompt_contains_image(const std::string& prompt) {
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
return (begin != std::string::npos);
}
// replaces the base64 image tag in the prompt with `replacement`
static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
size_t img_base64_str_start, img_base64_str_end;
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
return NULL;
}
auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
auto required_bytes = base64::required_encode_size(base64_str.size());
auto img_bytes = std::vector<unsigned char>(required_bytes);
base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
if (!embed) {
LOG_ERR("%s: could not load image from base64 string.\n", __func__);
return NULL;
}
return embed;
}
static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
if (begin == std::string::npos || end == std::string::npos) {
return prompt;
}
auto pre = prompt.substr(0, begin);
auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
return pre + replacement + post;
}
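For reference, the prompt shape these helpers accepted in the removed llava-cli looked like this (purely illustrative; <base64 byte string> stands in for the encoded JPEG data):

    std::string prompt =
        "describe this image: <img src=\"data:image/jpeg;base64,<base64 byte string>\">";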
struct llava_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_model * model = NULL;
};
static void print_usage(int, char ** argv) {
LOG("\n example usage:\n");
LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
// load and preprocess the image
llava_image_embed * embed = NULL;
auto prompt = params->prompt;
if (prompt_contains_image(prompt)) {
if (!params->image.empty()) {
LOG_INF("using base64 encoded image instead of command line image path\n");
}
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) {
LOG_ERR("%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
} else {
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;
}
}
return embed;
}
static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
int n_past = 0;
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
std::string system_prompt, user_prompt;
size_t image_pos = prompt.find("<image>");
if (image_pos != std::string::npos) {
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
LOG_INF("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
LOG_INF("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
} else {
// llava-1.5 native mode
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
user_prompt = prompt + "\nASSISTANT:";
if (params->verbose_prompt) {
auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
}
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
// generate the response
LOG("\n");
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
if (!smpl) {
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}
std::string response = "";
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
LOG("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
fflush(stdout);
}
common_sampler_free(smpl);
LOG("\n");
}
static struct llama_model * llava_init(common_params * params) {
llama_backend_init();
llama_numa_init(params->numa);
llama_model_params model_params = common_model_params_to_llama(*params);
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
return model;
}
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_context_params ctx_params = common_context_params_to_llama(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL;
}
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->ctx_clip = ctx_clip;
ctx_llava->model = model;
return ctx_llava;
}
static void llava_free(struct llava_context * ctx_llava) {
if (ctx_llava->ctx_clip) {
clip_free(ctx_llava->ctx_clip);
ctx_llava->ctx_clip = NULL;
}
llama_free(ctx_llava->ctx_llama);
llama_model_free(ctx_llava->model);
llama_backend_free();
}
int main(int argc, char ** argv) {
ggml_time_init();
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
return 1;
}
common_init();
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv);
return 1;
}
auto * model = llava_init(&params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
return 1;
}
if (prompt_contains_image(params.prompt)) {
auto * ctx_llava = llava_init_context(&params, model);
auto * image_embed = load_image(ctx_llava, &params, "");
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_perf_context_print(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
} else {
for (auto & image : params.image) {
auto * ctx_llava = llava_init_context(&params, model);
auto * image_embed = load_image(ctx_llava, &params, image);
if (!image_embed) {
LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
return 1;
}
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_perf_context_print(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
}
}
llama_model_free(model);
return 0;
}

View File

@ -10,6 +10,7 @@
#include <cstring>
#include <limits>
#include <vector>
#include <memory>
#if defined(LLAVA_LOG_OFF)
# define LOG_INF(...)
@ -45,6 +46,17 @@ struct clip_image_grid_shape {
int second;
};
// convenience cpp wrapper
struct clip_image_f32_batch_deleter {
void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
};
typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;
struct clip_image_size_deleter {
void operator()(clip_image_size * val) { clip_image_size_free(val); }
};
typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
/**
* Selects the best resolution from a list of possible resolutions based on the original size.
*
@ -100,13 +112,13 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
}
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
struct {
struct ggml_context * ctx;
} model;
const int32_t image_size = clip_image_size(ctx_clip);
const int32_t patch_size = clip_patch_size(ctx_clip);
const int32_t image_size = clip_get_image_size(ctx_clip);
const int32_t patch_size = clip_get_patch_size(ctx_clip);
int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
@ -163,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
model.ctx = ggml_init(params);
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
// fill it with the image embeddings, ignoring the base
for (size_t i = 1; i < num_images; i++) {
@ -202,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
// append without newline tokens (default behavior in llava_arch when not using unpad ):
memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
// Debug: Test single segments
// Current findings: sending base image, sending a segment embedding all works similar to python
@ -246,12 +258,9 @@ static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size)
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
// std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
clip_image_f32_batch img_res_v;
img_res_v.size = 0;
img_res_v.data = nullptr;
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
clip_image_f32_batch_ptr img_res_v(clip_image_f32_batch_init());
if (!clip_image_preprocess(ctx_clip, img, img_res_v.get())) {
LOG_ERR("%s: unable to preprocess image\n", __func__);
delete[] img_res_v.data;
return false;
}
@ -259,66 +268,72 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
const size_t n_imgs = clip_image_f32_batch_n_images(img_res_v.get());
if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size);
struct clip_image_size * load_image_size = clip_image_size_init();
image_embd_v.resize(n_imgs);
clip_image_size load_image_size;
for (size_t i = 0; i < img_res_v.size; i++) {
for (size_t i = 0; i < n_imgs; i++) {
const int64_t t_img_enc_step_start_us = ggml_time_us();
image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
int patch_size=14;
load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size);
int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, nx, ny));
int patch_size = 14;
load_image_size.width = nx;
load_image_size.height = ny;
clip_add_load_image_size(ctx_clip, &load_image_size);
bool encoded = false;
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
if (clip_is_qwen2vl(ctx_clip)) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]);
}
else {
encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(img_res, patch_size), image_embd_v[i]);
}
if (!encoded) {
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
return false;
}
const int64_t t_img_enc_step_batch_us = ggml_time_us();
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_step_batch_us - t_img_enc_step_start_us) / 1000.0);
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)n_imgs, (t_img_enc_step_batch_us - t_img_enc_step_start_us) / 1000.0);
}
const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) {
int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
std::memcpy(
image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
image_embd_v[i],
clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
clip_embd_nbytes_by_img(ctx_clip, nx, ny));
n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
}
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
free(image_embd_v[i]);
}
image_embd_v.clear();
load_image_size->width = img->nx;
load_image_size->height = img->ny;
clip_add_load_image_size(ctx_clip, load_image_size);
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
delete[] img_res_v.data;
img_res_v.size = 0;
img_res_v.data = nullptr;
load_image_size.width = img->nx;
load_image_size.height = img->ny;
clip_add_load_image_size(ctx_clip, &load_image_size);
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size.width, load_image_size.height);
}
else if (clip_is_glm(ctx_clip)){
struct clip_image_size * load_image_size = clip_image_size_init();
load_image_size->width = img_res_v.data[0].nx;
load_image_size->height = img_res_v.data[0].ny;
load_image_size->width = clip_image_f32_batch_nx(img_res_v.get(), 0);
load_image_size->height = clip_image_f32_batch_ny(img_res_v.get(), 0);
clip_add_load_image_size(ctx_clip, load_image_size);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2);
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
int pos = int(load_image_size->width/clip_get_patch_size(ctx_clip)/2);
*n_img_pos = (pos * pos + 2);
if (!encoded){
LOG_ERR("Unable to encode image \n");
@ -327,9 +342,9 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
delete[] img_res_v.data;
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
*n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
if (!encoded) {
LOG_ERR("Unable to encode image\n");
@ -340,17 +355,18 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
// spatial_unpad llava-1.6 type embedding
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size);
for (size_t i = 0; i < img_res_v.size; i++) {
image_embd_v.resize(n_imgs);
for (size_t i = 0; i < n_imgs; i++) {
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
const bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded) {
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
return false;
}
}
const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
const int32_t * image_grid = clip_image_grid(ctx_clip);
const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
@ -360,17 +376,13 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
}
// free all img_res_v - not needed anymore
delete[] img_res_v.data;
img_res_v.size = 0;
img_res_v.data = nullptr;
const int32_t image_size = clip_image_size(ctx_clip);
const int32_t image_size = clip_get_image_size(ctx_clip);
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
int n_img_pos_out;
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {

View File

@ -1,335 +0,0 @@
#include "arg.h"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include "ggml.h"
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <iostream> // TODO: remove me
struct llava_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_model * model = NULL;
};
static void show_additional_info(int /*argc*/, char ** argv) {
LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llama_model * llava_init(common_params * params) {
llama_backend_init();
llama_numa_init(params->numa);
llama_model_params model_params = common_model_params_to_llama(*params);
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
return model;
}
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
llama_context_params ctx_params = common_context_params_to_llama(*params);
if (params->n_ctx < 2048) {
// warn user here, "Image processing requires at least 2048 context, setting context to 2048"
LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
ctx_params.n_ctx = 2048;
} else {
ctx_params.n_ctx = params->n_ctx;
}
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL;
}
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->model = model;
return ctx_llava;
}
static void llava_free(struct llava_context * ctx_llava) {
if (ctx_llava->ctx_clip) {
clip_free(ctx_llava->ctx_clip);
ctx_llava->ctx_clip = NULL;
}
llama_free(ctx_llava->ctx_llama);
llama_model_free(ctx_llava->model);
llama_backend_free();
}
static struct clip_ctx * clip_init_context(common_params * params) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
return ctx_clip;
}
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
}
return true;
}
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past);
}
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
}
static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
slice_embed->embed = image_embed;
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
llava_image_embed_free(slice_embed);
}
static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) {
std::string system_prompt;
int idx = 0;
int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (has_minicpmv_projector == 2) {
system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
}
else if (has_minicpmv_projector == 3) {
system_prompt = "<|im_start|>user\n";
}
else if (has_minicpmv_projector == 4) {
system_prompt = "<|im_start|>user\n";
}
LOG_INF("%s: image token past: %d\n", __func__, n_past);
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (num_image_embeds > 1) {
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
for (size_t j = 0; j < num_image_embeds_col; ++j) {
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (j == num_image_embeds_col - 1) {
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
}
}
}
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
}
LOG_INF("%s: image token past: %d\n", __func__, n_past);
}
static const char * sample(struct common_sampler * smpl,
struct llama_context * ctx_llama,
int * n_past) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true);
const llama_model * model = llama_get_model(ctx_llama);
const llama_vocab * vocab = llama_model_get_vocab(model);
static std::string ret;
if (llama_vocab_is_eog(vocab, id)) {
ret = "</s>";
} else {
ret = common_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past);
return ret.c_str();
}
static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){
auto * ctx_clip = clip_init_context(params);
auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embeds) {
LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
return NULL;
}
// process the prompt
if (params->prompt.empty() && params->interactive == false) {
LOG_ERR("prompt should be given or interactive mode should be on");
return NULL;
}
auto * model = llava_init(params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
return NULL;
}
const int64_t t_llava_init_start_us = ggml_time_us();
auto * ctx_llava = llava_init_context(params, model);
ctx_llava->ctx_clip = ctx_clip;
const int64_t t_llava_init_end_us = ggml_time_us();
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
const int64_t t_process_image_start_us = ggml_time_us();
process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
llava_image_embed_free(embeds);
return ctx_llava;
}
static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){
std::string user_prompt = prompt;
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (!is_first) {
if (has_minicpmv_projector == 2) {
user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
}
else if (has_minicpmv_projector == 3) {
user_prompt = "<|im_start|>user\n" + prompt;
}
else if (has_minicpmv_projector == 4) {
user_prompt = "<|im_start|>user\n" + prompt;
}
}
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
if (has_minicpmv_projector == 2) {
eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
}
else if (has_minicpmv_projector == 3) {
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
}
else if (has_minicpmv_projector == 4) {
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
}
// generate the response
LOG_INF("\n");
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
return smpl;
}
static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
return tmp;
}
int main(int argc, char ** argv) {
ggml_time_init();
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
return 1;
}
common_init();
if (params.mmproj.empty() || (params.image.empty())) {
show_additional_info(argc, argv);
return 1;
}
for (auto & image : params.image) {
int n_past = 0;
auto * ctx_llava = minicpmv_init(&params, image, n_past);
if (!params.prompt.empty()) {
LOG("<user>%s\n", params.prompt.c_str());
LOG("<assistant>");
auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
std::string response;
bool have_tmp = false;
for (int i = 0; i < max_tgt_len; i++) {
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0){
if (!have_tmp) {
continue;
}
break;
}
if (strstr(tmp, "###")) break; // Yi-VL behavior
have_tmp = true;
printf("%s", tmp);
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
fflush(stdout);
}
common_sampler_free(smpl);
}else {
while (true) {
LOG("<user>");
std::string prompt;
std::getline(std::cin, prompt);
LOG("<assistant>");
auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
std::string response;
for (int i = 0; i < max_tgt_len; i++) {
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
printf("%s", tmp);// mistral llava-1.6
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
fflush(stdout);
}
common_sampler_free(smpl);
}
}
printf("\n");
llama_perf_context_print(ctx_llava->ctx_llama);
ctx_llava->model = NULL;
llava_free(ctx_llava);
}
return 0;
}

View File

@ -597,7 +597,6 @@ elif args.minicpmv_projector is not None:
fname_middle = "mmproj-"
has_text_encoder = False
has_minicpmv_projector = True
minicpmv_version = 4
elif args.vision_only:
fname_middle = "vision-"
has_text_encoder = False

examples/llava/mtmd-cli.cpp Normal file (345 lines)
View File

@ -0,0 +1,345 @@
#include "arg.h"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "llama.h"
#include "ggml.h"
#include "console.h"
#include "chat.h"
#include "mtmd.h"
#include <vector>
#include <limits.h>
#include <cinttypes>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif
// volatile, because of signal being an interrupt
static volatile bool g_is_generating = false;
static volatile bool g_is_interrupted = false;
/**
* Please note that this is NOT production-ready code.
* It is a playground for trying multimodal support in llama.cpp.
* For contributors: please keep this code simple and easy to understand.
*/
static void show_additional_info(int /*argc*/, char ** argv) {
LOG(
"Experimental CLI for multimodal\n\n"
"Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
" -m and --mmproj are required\n"
" -hf user/repo can replace both -m and --mmproj in most cases\n"
" --image and -p are optional, if NOT provided, the CLI will run in chat mode\n"
" to disable using GPU for mmproj model, add --no-mmproj-offload\n",
argv[0]
);
}
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void sigint_handler(int signo) {
if (signo == SIGINT) {
if (g_is_generating) {
g_is_generating = false;
} else {
console::cleanup();
if (g_is_interrupted) {
_exit(1);
}
g_is_interrupted = true;
}
}
}
#endif
struct mtmd_cli_context {
mtmd_context_ptr ctx_vision;
common_init_result llama_init;
llama_model * model;
llama_context * lctx;
const llama_vocab * vocab;
llama_batch batch;
int n_batch;
// note: we know that the gemma3 template is "linear", meaning each turn is completely separate from the others
// so here we don't need to keep track of chat history
common_chat_templates_ptr tmpls;
// support for legacy templates (models not having EOT token)
llama_tokens antiprompt_tokens;
int n_threads = 1;
llama_pos n_past = 0;
mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
model = llama_init.model.get();
lctx = llama_init.context.get();
vocab = llama_model_get_vocab(model);
n_threads = params.cpuparams.n_threads;
batch = llama_batch_init(params.n_batch, 0, 1);
n_batch = params.n_batch;
if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) {
LOG_ERR("Model does not have chat template.\n");
LOG_ERR(" For old llava models, you may need to use '--chat-template vicuna'\n");
LOG_ERR(" For MobileVLM models, use '--chat-template deepseek'\n");
LOG_ERR(" For Mistral Small 3.1, use '--chat-template mistral-v7'\n");
exit(1);
}
tmpls = common_chat_templates_init(model, params.chat_template);
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja).c_str());
init_vision_context(params);
// load antiprompt tokens for legacy templates
if (params.chat_template == "vicuna") {
antiprompt_tokens = common_tokenize(lctx, "ASSISTANT:", false, true);
} else if (params.chat_template == "deepseek") {
antiprompt_tokens = common_tokenize(lctx, "###", false, true);
}
}
void init_vision_context(common_params & params) {
const char * clip_path = params.mmproj.path.c_str();
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
/* use_gpu */ params.mmproj_use_gpu,
/* timings */ true,
/* n_threads */ params.cpuparams.n_threads,
/* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
}));
if (!ctx_vision.get()) {
LOG_ERR("Failed to load vision model from %s\n", clip_path);
exit(1);
}
}
bool check_antiprompt(const llama_tokens & generated_tokens) {
if (antiprompt_tokens.empty() || generated_tokens.size() < antiprompt_tokens.size()) {
return false;
}
return std::equal(
generated_tokens.end() - antiprompt_tokens.size(),
generated_tokens.end(),
antiprompt_tokens.begin()
);
}
};
static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
llama_tokens generated_tokens;
for (int i = 0; i < n_predict; i++) {
if (i > n_predict || !g_is_generating || g_is_interrupted) {
printf("\n");
break;
}
llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
generated_tokens.push_back(token_id);
common_sampler_accept(smpl, token_id, true);
if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) {
printf("\n");
break; // end of generation
}
printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
fflush(stdout);
if (g_is_interrupted) {
printf("\n");
break;
}
// eval the token
common_batch_clear(ctx.batch);
common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
if (llama_decode(ctx.lctx, ctx.batch)) {
LOG_ERR("failed to decode token\n");
return 1;
}
}
return 0;
}
static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vector<std::string> & images_fname, bool add_bos = false) {
std::vector<mtmd_bitmap> bitmaps;
common_chat_templates_inputs tmpl_inputs;
tmpl_inputs.messages = {msg};
tmpl_inputs.add_generation_prompt = true;
tmpl_inputs.use_jinja = false; // jinja is buggy here
auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
for (auto & fname : images_fname) {
mtmd_bitmap bitmap;
if (mtmd_helper_bitmap_init_from_file(fname.c_str(), bitmap)) {
LOG_ERR("Unable to load image %s\n", fname.c_str());
return 2; // image not found
}
bitmaps.push_back(std::move(bitmap));
}
mtmd_input_text text;
text.text = formatted_chat.prompt;
text.add_special = add_bos;
text.parse_special = true;
mtmd_input_chunks chunks;
if (g_is_interrupted) return 0;
int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks, text, bitmaps);
if (res != 0) {
LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
return 1;
}
if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
LOG_ERR("Unable to eval prompt\n");
return 1;
}
ctx.n_past += mtmd_helper_get_n_pos(chunks);
return 0;
}
int main(int argc, char ** argv) {
ggml_time_init();
common_params params;
params.sampling.temp = 0.2; // lower temp by default for better quality
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
return 1;
}
common_init();
if (params.mmproj.path.empty()) {
show_additional_info(argc, argv);
LOG_ERR("ERR: Missing --mmproj argument\n");
return 1;
}
mtmd_cli_context ctx(params);
printf("%s: %s\n", __func__, params.model.path.c_str());
bool is_single_turn = !params.prompt.empty() && !params.image.empty();
struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
// ctrl+C handling
{
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = sigint_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
}
if (g_is_interrupted) return 130;
if (is_single_turn) {
g_is_generating = true;
if (params.prompt.find("<__image__>") == std::string::npos) {
params.prompt += " <__image__>";
}
common_chat_msg msg;
msg.role = "user";
msg.content = params.prompt;
if (eval_message(ctx, msg, params.image, true)) {
return 1;
}
if (!g_is_interrupted && generate_response(ctx, smpl, n_predict)) {
return 1;
}
} else {
LOG("\n Running in chat mode, available commands:");
LOG("\n /image <path> load an image");
LOG("\n /clear clear the chat history");
LOG("\n /quit or /exit exit the program");
LOG("\n");
bool is_first_msg = true;
std::vector<std::string> images_fname;
std::string content;
while (!g_is_interrupted) {
g_is_generating = false;
LOG("\n> ");
console::set_display(console::user_input);
std::string line;
console::readline(line, false);
if (g_is_interrupted) break;
console::set_display(console::reset);
line = string_strip(line);
if (line.empty()) {
continue;
}
if (line == "/quit" || line == "/exit") {
break;
}
if (line == "/clear") {
ctx.n_past = 0;
llama_kv_self_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
LOG("Chat history cleared\n\n");
continue;
}
g_is_generating = true;
if (line.find("/image") == 0) {
std::string image = line.substr(7);
images_fname.push_back(string_strip(image));
content += "<__image__>";
continue;
} else {
content += line;
}
common_chat_msg msg;
msg.role = "user";
msg.content = content;
int ret = eval_message(ctx, msg, images_fname, is_first_msg);
if (g_is_interrupted) break;
if (ret == 2) {
// non-fatal error
images_fname.clear();
content.clear();
continue;
}
if (ret) {
return 1;
}
if (generate_response(ctx, smpl, n_predict)) {
return 1;
}
images_fname.clear();
content.clear();
is_first_msg = false;
}
}
if (g_is_interrupted) LOG("\nInterrupted by user\n");
LOG("\n\n");
llama_perf_context_print(ctx.lctx);
return g_is_interrupted ? 130 : 0;
}

examples/llava/mtmd.cpp Normal file (708 lines)
View File

@ -0,0 +1,708 @@
#include "clip.h"
#include "clip-impl.h"
#include "mtmd.h"
#include "llama.h"
#include <algorithm>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <vector>
// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
// models not having it (llava-1.6) will process embeddings without any special tokens in-between
enum mtmd_slice_tmpl {
MTMD_SLICE_TMPL_NONE,
MTMD_SLICE_TMPL_MINICPMV_2_5,
MTMD_SLICE_TMPL_MINICPMV_2_6,
// TODO @ngxson : add support for idefics (SmolVLM)
};
struct mtmd_context {
struct clip_ctx * ctx_clip;
const struct llama_model * text_model;
std::vector<float> image_embd_v; // image embedding vector
bool print_timings;
int n_threads;
std::string image_marker;
// for minicpmv, we need special tokens in-between slices
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
llama_token tok_ov_img_start = LLAMA_TOKEN_NULL; // overview image
llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices
llama_token tok_slices_end = LLAMA_TOKEN_NULL; // end of all slices
llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice
llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice
llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
// TODO @ngxson : add timings
mtmd_context(const char * mmproj_fname,
const llama_model * text_model,
const mtmd_context_params & ctx_params) :
text_model (text_model),
print_timings(ctx_params.print_timings),
n_threads (ctx_params.n_threads),
image_marker (ctx_params.image_marker)
{
clip_context_params ctx_clip_params;
ctx_clip_params.use_gpu = ctx_params.use_gpu;
ctx_clip_params.verbosity = ctx_params.verbosity;
ctx_clip = clip_init(mmproj_fname, ctx_clip_params);
if (!ctx_clip) {
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
}
use_mrope = clip_is_qwen2vl(ctx_clip);
int minicpmv_version = clip_is_minicpmv(ctx_clip);
if (minicpmv_version == 2) {
// minicpmv 2.5 format:
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
tok_ov_img_start = lookup_token("<image>");
tok_ov_img_end = lookup_token("</image>");
tok_slices_start = lookup_token("<slice>");
tok_slices_end = lookup_token("</slice>");
tok_sli_img_start = tok_ov_img_start;
tok_sli_img_end = tok_ov_img_end;
tok_row_end = lookup_token("\n");
} else if (minicpmv_version == 3 || minicpmv_version == 4) {
// minicpmv 2.6 format:
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
tok_ov_img_start = lookup_token("<image>");
tok_ov_img_end = lookup_token("</image>");
tok_sli_img_start = lookup_token("<slice>");
tok_sli_img_end = lookup_token("</slice>");
tok_row_end = lookup_token("\n");
} else if (minicpmv_version != 0) {
GGML_ASSERT(false && "unsupported minicpmv version");
}
}
~mtmd_context() {
clip_free(ctx_clip);
}
private:
llama_token lookup_token(const std::string & token_text) {
const llama_vocab * vocab = llama_model_get_vocab(text_model);
const int n_vocab = llama_vocab_n_tokens(vocab);
for (int i = 0; i < n_vocab; i++) {
if (token_to_piece(vocab, i, true) == token_text) {
return i;
}
}
return LLAMA_TOKEN_NULL;
}
std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
if (n_chars < 0) {
piece.resize(-n_chars);
int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
GGML_ASSERT(check == -n_chars);
} else {
piece.resize(n_chars);
}
return piece;
}
};
struct mtmd_image_tokens_data {
clip_image_f32_batch batch_f32; // preprocessed image patches
};
struct mtmd_image_tokens {
uint32_t nx; // number of tokens in x direction
uint32_t ny; // number of tokens in y direction
bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
uint32_t n_tokens() const { return nx * ny; }
clip_image_f32_batch batch_f32; // preprocessed image patches
std::string id; // optional user-defined ID, useful for KV cache tracking
};
mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
const struct llama_model * text_model,
const struct mtmd_context_params ctx_params) {
try {
return new mtmd_context(mmproj_fname, text_model, ctx_params);
} catch (const std::exception & e) {
LOG_ERR("%s: error: %s\n", __func__, e.what());
return nullptr;
}
}
void mtmd_free(mtmd_context * ctx) {
if (ctx) {
delete ctx;
}
}
// copied from common_tokenize
static std::vector<llama_token> mtmd_tokenize_text_internal(
const struct llama_vocab * vocab,
const std::string & text,
bool add_special,
bool parse_special) {
// upper limit for the number of tokens
int n_tokens = text.length() + 2 * add_special;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
}
return result;
}
int32_t mtmd_tokenize(mtmd_context * ctx,
std::vector<mtmd_input_chunk> & output,
const mtmd_input_text & text,
const std::vector<mtmd_bitmap> & bitmaps) {
auto vocab = llama_model_get_vocab(ctx->text_model);
std::string prompt_modified(text.text);
std::string marker_modified(ctx->image_marker);
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
// a bit hacky here, but works for now
// for some models, we need to add prefix and suffix to the image embeddings
if (clip_is_gemma3(ctx->ctx_clip)) {
// gemma 3
// <start_of_image> ... (image embeddings) ... <end_of_image>
marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
// <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
marker_modified = ctx->image_marker + "[IMG_END]";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
}
else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
}
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
output.clear();
output.reserve(parts.size());
size_t i_img = 0;
// utility for adding raw tokens
auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_TEXT,
std::move(tokens),
{},
};
output.emplace_back(std::move(chunk));
};
// utility for splitting a batch of multiple images into chunks, each holding a single image
auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) {
std::vector<mtmd_input_chunk> chunks;
for (auto & entry : batch_f32.entries) {
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
image_tokens->ny = 1;
image_tokens->batch_f32.entries.push_back(std::move(entry));
image_tokens->id = id;
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_IMAGE,
{},
std::move(image_tokens),
};
chunks.emplace_back(std::move(chunk));
}
return chunks;
};
for (const auto & part : parts) {
// printf("tokenizing part: %s\n", part.c_str());
bool add_bos = &parts.front() == &part;
auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
if (tokens.empty()) {
continue;
}
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_TEXT,
std::move(tokens),
{},
};
output.emplace_back(std::move(chunk));
if (&parts.back() != &part) {
// add image tokens between the two surrounding text parts
if (i_img >= bitmaps.size()) {
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
return 1;
}
// convert mtmd_bitmap to clip_image_u8
clip_image_u8_ptr img_u8(clip_image_u8_init());
img_u8->nx = bitmaps[i_img].nx;
img_u8->ny = bitmaps[i_img].ny;
img_u8->buf.resize(bitmaps[i_img].data.size());
std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
clip_image_size img_u8_size{img_u8->nx, img_u8->ny};
// preprocess image
clip_image_f32_batch batch_f32;
bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
if (!ok) {
LOG_ERR("Unable to preprocess image\n");
return 2;
}
if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
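// the emitted chunk sequence for MiniCPM-V roughly follows:
//   <ov_img_start> (overview image) <ov_img_end>
//   <slices_start> [<sli_img_start> (slice) <sli_img_end>] ... <row_end> ... <slices_end>
// row/slice/slices tokens are skipped when the corresponding token is LLAMA_TOKEN_NULL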
// split batch into chunks of single images
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img].id);
GGML_ASSERT(chunks.size() > 0);
// add overview image
add_text_chunk({ctx->tok_ov_img_start});
output.emplace_back(std::move(chunks.front()));
chunks.erase(chunks.begin());
add_text_chunk({ctx->tok_ov_img_end});
// add slices
if (!chunks.empty()) {
clip_add_load_image_size(ctx->ctx_clip, &img_u8_size);
int n_col = clip_uhd_num_image_embeds_col(ctx->ctx_clip);
int n_row = (int)chunks.size() / n_col;
GGML_ASSERT(n_row * n_col == (int)chunks.size());
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_slices_start});
}
for (int y = 0; y < n_row; y++) {
for (int x = 0; x < n_col; x++) {
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_sli_img_start});
}
output.emplace_back(std::move(chunks[y * n_col + x]));
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_sli_img_end});
}
}
if (ctx->tok_row_end != LLAMA_TOKEN_NULL && y != n_row - 1) {
add_text_chunk({ctx->tok_row_end});
}
}
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_slices_end});
}
}
} else {
size_t n_tokens = 0;
for (const auto & entry : batch_f32.entries) {
n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
}
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
if (ctx->use_mrope) {
// for Qwen2VL, we need this information for M-RoPE decoding positions
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
image_tokens->use_mrope_pos = true;
} else {
// other models, we only need the total number of tokens
image_tokens->nx = n_tokens;
image_tokens->ny = 1;
}
image_tokens->batch_f32 = std::move(batch_f32);
image_tokens->id = bitmaps[i_img].id; // optional
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_IMAGE,
{},
std::move(image_tokens),
};
output.emplace_back(std::move(chunk));
}
i_img++; // move to next image
}
}
return 0;
}
void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
if (image_tokens) {
delete image_tokens;
}
}
size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
return image_tokens->n_tokens();
}
size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
return image_tokens->nx;
}
size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
return image_tokens->ny;
}
std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
return image_tokens->id;
}
llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
if (image_tokens->use_mrope_pos) {
return 1; // for M-RoPE, the whole image occupies a single temporal position
}
return image_tokens->n_tokens();
}
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
bool ok = false;
// only effective for minicpmv and qwen2vl, other models will ignore load_image_size
{
clip_image_size slice_size{
image_tokens->batch_f32.entries[0]->nx,
image_tokens->batch_f32.entries[0]->ny};
clip_add_load_image_size(ctx->ctx_clip, &slice_size);
}
if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
const auto & entries = image_tokens->batch_f32.entries;
for (size_t i = 0; i < entries.size(); i++) {
int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get());
ok = clip_image_encode(
ctx->ctx_clip,
ctx->n_threads,
entries[i].get(),
ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
}
} else {
ok = clip_image_batch_encode(
ctx->ctx_clip,
ctx->n_threads,
&image_tokens->batch_f32,
ctx->image_embd_v.data());
}
return ok ? 0 : 1;
}
float * mtmd_get_output_embd(mtmd_context * ctx) {
return ctx->image_embd_v.data();
}
size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
size_t n_tokens = 0;
for (auto & chunk : chunks) {
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
n_tokens += chunk.tokens_text.size();
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
n_tokens += mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
} else {
GGML_ASSERT(false && "chunk type not supported");
}
}
return n_tokens;
}
llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks) {
llama_pos n_pos = 0;
for (auto & chunk : chunks) {
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
n_pos += chunk.tokens_text.size();
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
n_pos += mtmd_image_tokens_get_n_pos(chunk.tokens_image.get());
} else {
GGML_ASSERT(false && "chunk type not supported");
}
}
return n_pos;
}
// helper struct to make working with embd batch easier
// note: this will be removed after llama_batch_ext refactoring
struct decode_embd_batch {
int n_pos_per_embd;
int n_mmproj_embd;
std::vector<llama_pos> pos;
std::vector<llama_pos> pos_view; // used by mrope
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id> seq_id_0;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
logits .resize(n_tokens);
seq_id_0.resize(1);
seq_ids [n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
}
void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
seq_id_0[0] = seq_id;
for (int i = 0; i < batch.n_tokens; i++) {
batch.pos [i] = pos_0 + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
GGML_ASSERT(n_pos_per_embd == 4);
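// pos is laid out as n_pos_per_embd (= 4) contiguous sections of n_tokens entries each:
// [temporal | height (y) | width (x) | unused]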
seq_id_0[0] = seq_id;
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
int i = y * nx + x;
pos[i ] = pos_0;
pos[i + batch.n_tokens ] = pos_0 + y;
pos[i + batch.n_tokens * 2] = pos_0 + x;
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
}
}
for (int i = 0; i < batch.n_tokens; i++) {
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
llama_batch get_view(int offset, int n_tokens) {
llama_pos * pos_ptr;
pos_view.clear();
pos_view.resize(n_tokens * n_pos_per_embd);
if (n_pos_per_embd > 1) {
// mrope
// for example, with layout of src: 1234...1234...1234...1234...
// offset 2 will give us dst: 34...34...34...34...
for (int i = 0; i < n_pos_per_embd; i++) {
auto src = pos.begin() + i * batch.n_tokens + offset;
pos_view.insert(pos_view.end(), src, src + n_tokens);
}
pos_ptr = pos_view.data();
} else {
// normal
pos_ptr = pos.data() + offset;
}
return {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ batch.embd + offset * n_mmproj_embd,
/*pos =*/ pos_ptr,
/*n_seq_id =*/ batch.n_seq_id + offset,
/*seq_id =*/ batch.seq_id + offset,
/*logits =*/ batch.logits + offset,
};
}
};
int32_t mtmd_helper_eval(mtmd_context * ctx,
llama_context * lctx,
mtmd_input_chunks & chunks,
llama_pos pos0,
llama_seq_id seq_id,
int32_t n_batch) {
int32_t ret;
llama_pos n_past = pos0;
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
for (auto & chunk : chunks) {
bool is_last = &chunk == &chunks.back();
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
text_batch.n_tokens = chunk.tokens_text.size();
size_t i = 0;
while (i < chunk.tokens_text.size()) { // split into batches
for (; i < chunk.tokens_text.size() && text_batch.n_tokens < n_batch; i++) {
text_batch.token [i] = chunk.tokens_text[i];
text_batch.pos [i] = n_past++;
text_batch.n_seq_id[i] = 1;
text_batch.seq_id [i][0] = seq_id;
text_batch.logits [i] = false;
}
if (is_last) {
// always get logits for last input chunk
text_batch.logits[text_batch.n_tokens - 1] = true;
}
ret = llama_decode(lctx, text_batch);
if (ret != 0) {
LOG_ERR("failed to decode text\n");
llama_batch_free(text_batch);
return ret;
}
}
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
GGML_ASSERT(!is_last && "logits for the last image chunk are not yet supported");
GGML_ASSERT(chunk.tokens_image != nullptr);
int64_t t0 = ggml_time_ms();
if (ctx->print_timings) {
LOG_INF("encoding image or slice...\n");
}
ret = mtmd_encode(ctx, chunk.tokens_image.get());
if (ret != 0) {
LOG_ERR("failed to encode image\n");
llama_batch_free(text_batch);
return ret;
}
if (ctx->print_timings) {
LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
}
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
float * embd = mtmd_get_output_embd(ctx);
decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
if (mtmd_decode_use_mrope(ctx)) {
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
} else {
batch_embd.set_position_normal(n_past, seq_id);
}
if (mtmd_decode_use_non_causal(ctx)) {
llama_set_causal_attn(lctx, false);
// TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
}
while (i_batch < n_img_batches) { // split into batches
int pos_offset = i_batch*n_batch;
int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
int64_t t1 = ggml_time_ms();
ret = llama_decode(lctx, batch_embd_view);
if (ret != 0) {
LOG_ERR("failed to decode image\n");
llama_set_causal_attn(lctx, true); // restore causal attn
llama_batch_free(text_batch);
return ret;
}
if (ctx->print_timings) {
LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
}
i_batch++;
}
// for M-RoPE, one image occupies a single temporal position
n_past += mtmd_decode_use_mrope(ctx) ? 1 : n_tokens;
if (mtmd_decode_use_non_causal(ctx)) {
llama_set_causal_attn(lctx, true);
}
} else {
GGML_ASSERT(false && "chunk type not supported");
}
}
llama_batch_free(text_batch);
return 0;
}
int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output) {
clip_image_u8_ptr img_u8(clip_image_u8_init());
bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
if (!ok) {
LOG_ERR("Unable to load image from buffer\n");
return 1;
}
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
output.data.resize(output.nx * output.ny * 3);
std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
return 0;
}
int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
clip_image_u8_ptr img_u8(clip_image_u8_init());
bool ok = clip_image_load_from_file(fname, img_u8.get());
if (!ok) {
LOG_ERR("Unable to load image %s\n", fname);
return 1;
}
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
output.data.resize(output.nx * output.ny * 3);
std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
return 0;
}
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
return true;
}
return false;
}
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
return ctx->use_mrope;
}
void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
mtmd_image_tokens_free(val);
}

168
examples/llava/mtmd.h Normal file
View File

@ -0,0 +1,168 @@
#ifndef MTMD_H
#define MTMD_H
#include "ggml.h"
#include "llama.h"
#include "clip.h"
#include <vector>
#include <cinttypes>
#include <memory>
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define MTMD_API __declspec(dllexport)
# else
# define MTMD_API __declspec(dllimport)
# endif
# else
# define MTMD_API __attribute__ ((visibility ("default")))
# endif
#else
# define MTMD_API
#endif
#ifdef __cplusplus
enum mtmd_input_chunk_type {
MTMD_INPUT_CHUNK_TYPE_TEXT,
MTMD_INPUT_CHUNK_TYPE_IMAGE,
};
struct mtmd_context;
struct mtmd_image_tokens;
// represents raw image data, layout is RGBRGBRGB...
// length of data must be nx * ny * 3
struct mtmd_bitmap {
uint32_t nx;
uint32_t ny;
std::vector<unsigned char> data;
std::string id; // optional user-defined id, e.g. an image hash, useful for KV cache tracking
};
struct mtmd_image_tokens_deleter {
void operator()(mtmd_image_tokens * val); // forward declaration
};
using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
struct mtmd_input_chunk {
mtmd_input_chunk_type type;
std::vector<llama_token> tokens_text;
mtmd_image_tokens_ptr tokens_image;
};
using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
struct mtmd_context_params {
bool use_gpu = true;
bool print_timings = true;
int n_threads = 4;
enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
const char * image_marker = "<__image__>";
};
struct mtmd_input_text {
std::string text;
bool add_special;
bool parse_special;
};
// initialize the mtmd context
// return nullptr on failure
MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
const llama_model * text_model,
const mtmd_context_params ctx_params);
MTMD_API void mtmd_free(mtmd_context * ctx);
// tokenize an input text prompt and an image
// the prompt must have the input image marker (default: "<__image__>") in it
// the marker will be replaced with the image tokens
// for example:
// "here is an image: <__image__>\ndescribe it in detail."
// this will give 3 chunks:
// 1. "here is an image: <start_of_image>"
// 2. (image tokens)
// 3. "<end_of_image>\ndescribe it in detail."
// number of bitmaps must be equal to the number of image markers in the prompt
// this function is thread-safe (shared ctx)
// return values:
// 0 on success
// 1 when the number of images does not match the number of markers
// 2 on image preprocessing error
MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
std::vector<mtmd_input_chunk> & output,
const mtmd_input_text & text,
const std::vector<mtmd_bitmap> & bitmaps);
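// illustrative usage sketch (not part of the API; "mmproj.gguf", "image.jpg" and the prompt
// below are placeholder examples, error handling omitted):
//
//   mtmd_context_params mparams;                          // defaults are usually fine
//   mtmd_context * mctx = mtmd_init_from_file("mmproj.gguf", model, mparams);
//   mtmd_bitmap bitmap;
//   mtmd_helper_bitmap_init_from_file("image.jpg", bitmap);
//   mtmd_input_text prompt {"describe this image: <__image__>", /*add_special=*/true, /*parse_special=*/true};
//   std::vector<mtmd_input_chunk> chunks;
//   int32_t res = mtmd_tokenize(mctx, chunks, prompt, {bitmap}); // 0 on success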
// access mtmd_image_tokens
MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
MTMD_API llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens); // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
// returns 0 on success
MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
const mtmd_image_tokens * image_tokens);
// get output embeddings from the last encode pass
MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
// whether we need to set non-causal mask before llama_decode
MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
// whether the current model uses M-RoPE for llama_decode
MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
//
// helper functions (can be implemented based on other functions)
//
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
// helper to count the total number of positions from a list of chunks, useful to keep track of n_past
MTMD_API llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks);
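// e.g. chunks holding 10 text tokens plus one image of 256 tokens give n_tokens = 266,
// while n_pos is also 266 for most models but only 11 for M-RoPE models (the whole image
// occupies a single temporal position)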
// helper function that automatically:
// 1. run llama_decode() on text chunks
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
// otherwise, returns 0 on success
MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
llama_context * lctx,
mtmd_input_chunks & chunks,
llama_pos pos0,
llama_seq_id seq_id,
int32_t n_batch);
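// illustrative sketch (not part of the API): feeding the tokenized chunks to a llama_context
// and tracking n_past with the helpers above; mctx, lctx and chunks are assumed to already
// exist, and the batch size of 512 is an arbitrary example:
//
//   llama_pos n_past = 0;
//   if (mtmd_helper_eval(mctx, lctx, chunks, n_past, /*seq_id=*/0, /*n_batch=*/512) == 0) {
//       n_past += mtmd_helper_get_n_pos(chunks); // an image counts as a single position with M-RoPE
//       // ... continue with llama_decode() / sampling for the generated text
//   }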
// helper function to construct a mtmd_bitmap from a file
// returns 0 on success
// this function is thread-safe
MTMD_API int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
// helper function to construct a mtmd_bitmap from a buffer
// the buffer must be an image in a format supported by stb_image (jpg, png, bmp, gif, etc.)
// returns 0 on success
// this function is thread-safe
MTMD_API int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output);
// convenient unique_ptr wrappers
struct mtmd_context_deleter {
void operator()(mtmd_context * val) { mtmd_free(val); }
};
using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
#else
static_assert(false && "C header is not yet supported by this library");
#endif
#endif

View File

@ -1,14 +1,16 @@
import argparse
from typing import Dict
from typing import Dict, List, Optional
import torch
import numpy as np
from gguf import *
from transformers import (
Qwen2VLForConditionalGeneration,
Qwen2VLProcessor,
AutoProcessor,
Qwen2VLConfig
Qwen2VLConfig,
Qwen2VLProcessor,
Qwen2VLForConditionalGeneration,
Qwen2_5_VLConfig, # type: ignore[reportAttributeAccessIssue]
Qwen2_5_VLForConditionalGeneration, # type: ignore[reportAttributeAccessIssue]
)
@ -19,61 +21,93 @@ def k(raw_key: str, arch: str) -> str:
return raw_key.format(arch=arch)
def to_gguf_name(name: str) -> str:
og = name
name = name.replace("text_model", "t").replace("vision_model", "v")
name = name.replace("blocks", "blk").replace("embeddings.", "")
name = name.replace("attn.", "attn_")
name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
# name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
name = name.replace("norm1", "ln1").replace("norm2", "ln2")
name = name.replace("merger.mlp", 'mm')
print(f"[to_gguf_name] {og} --> {name}")
return name
def get_n_wa_pattern(fullatt_block_indexes: Optional[List[int]]):
if fullatt_block_indexes is None:
return 0
n_wa = fullatt_block_indexes[0]
for a, b in zip(fullatt_block_indexes, fullatt_block_indexes[1:]):
if b - a - 1 != n_wa:
raise ValueError(
"window/full attention layers should follow a fixed pattern: "
f"expected {n_wa} window-attention layers before each full-attention layer"
)
return n_wa + 1
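# illustrative example (indexes are hypothetical): fullatt_block_indexes = [7, 15, 23, 31]
# means every 8th block uses full attention, so n_wa = 7 and the function returns 8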
def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
vision_model = qwen2vl.visual
tensor_map = {}
for name, ten in vision_model.state_dict().items():
ten = ten.numpy()
if 'qkv' in name:
if ten.ndim == 2: # weight
c3, _ = ten.shape
else: # bias
c3 = ten.shape[0]
assert c3 % 3 == 0
c = c3 // 3
wq = ten[:c]
wk = ten[c: c * 2]
wv = ten[c * 2:]
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
elif 'merger' in name:
if name.endswith("ln_q.weight"):
tensor_map['v.post_ln.weight'] = ten
elif name.endswith("ln_q.bias"):
tensor_map['v.post_ln.bias'] = ten
class VL2:
@staticmethod
def to_gguf_name(name: str) -> str:
og = name
name = name.replace("text_model", "t").replace("vision_model", "v")
name = name.replace("blocks", "blk").replace("embeddings.", "")
name = name.replace("attn.", "attn_")
name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
# name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
name = name.replace("norm1", "ln1").replace("norm2", "ln2")
name = name.replace("merger.mlp", 'mm')
print(f"[to_gguf_name] {og} --> {name}")
return name
@classmethod
def find_vision_tensors(cls, qwen2vl, dtype) -> Dict[str, np.ndarray]:
vision_model = qwen2vl.visual
tensor_map = {}
for name, ten in vision_model.state_dict().items():
ten = ten.numpy()
if 'qkv' in name:
if ten.ndim == 2: # weight
c3, _ = ten.shape
else: # bias
c3 = ten.shape[0]
assert c3 % 3 == 0
c = c3 // 3
wq = ten[:c]
wk = ten[c: c * 2]
wv = ten[c * 2:]
tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
elif 'merger' in name:
if name.endswith("ln_q.weight"):
tensor_map['v.post_ln.weight'] = ten
elif name.endswith("ln_q.bias"):
tensor_map['v.post_ln.bias'] = ten
else:
# "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
tensor_map[cls.to_gguf_name(name)] = ten
elif 'patch_embed.proj.weight' in name:
# NOTE: split Conv3D into Conv2Ds
c1, c2, kt, kh, kw = ten.shape
assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
else:
# "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
tensor_map[to_gguf_name(name)] = ten
elif 'patch_embed.proj.weight' in name:
# NOTE: split Conv3D into Conv2Ds
c1, c2, kt, kh, kw = ten.shape
assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
else:
tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
tensor_map[cls.to_gguf_name(f"vision_model.{name}")] = ten
for new_name, ten in tensor_map.items():
if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
tensor_map[new_name] = ten.astype(np.float32)
else:
tensor_map[new_name] = ten.astype(dtype)
tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder
return tensor_map
for new_name, ten in tensor_map.items():
if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
tensor_map[new_name] = ten.astype(np.float32)
else:
tensor_map[new_name] = ten.astype(dtype)
tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder
return tensor_map
class VL25(VL2):
@staticmethod
def to_gguf_name(name: str) -> str:
og = name
name = name.replace("text_model", "t").replace("vision_model", "v")
name = name.replace("blocks", "blk").replace("embeddings.", "")
name = name.replace("attn.", "attn_")
name = name.replace("mlp.down_proj", "ffn_down").replace("mlp.up_proj", "ffn_up")
name = name.replace("mlp.gate_proj", "ffn_gate").replace("proj.", "out.")
name = name.replace("norm1", "ln1").replace("norm2", "ln2")
name = name.replace("merger.mlp", 'mm')
print(f"[vl25][to_gguf_name] {og} --> {name}")
return name
def main(args):
@ -82,7 +116,7 @@ def main(args):
np_dtype = np.float32
ftype = 0
elif args.data_type == 'fp16':
dtype = torch.float32
dtype = torch.float16
np_dtype = np.float16
ftype = 1
else:
@ -92,11 +126,18 @@ def main(args):
model_path = ""
model_name = args.model_name
print("model_name: ", model_name)
qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
model_name, torch_dtype=dtype, device_map="cpu"
)
cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
vcfg = cfg.vision_config
if args.model_type == "qwen2vl":
qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
model_name, torch_dtype=dtype, device_map="cpu"
)
cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
vcfg = cfg.vision_config
else:
qwen2vl = Qwen2_5_VLForConditionalGeneration.from_pretrained(
model_name, torch_dtype=dtype, device_map="cpu"
)
cfg: Qwen2_5_VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
vcfg = cfg.vision_config
if os.path.isdir(model_name):
local_model = True
@ -113,7 +154,6 @@ def main(args):
fout.add_bool("clip.has_text_encoder", False)
fout.add_bool("clip.has_vision_encoder", True)
fout.add_bool("clip.has_qwen2vl_merger", True)
fout.add_string("clip.projector_type", "qwen2vl_merger")
print(cfg.vision_config)
if 'silu' in cfg.vision_config.hidden_act.lower():
@ -125,14 +165,25 @@ def main(args):
else:
raise ValueError()
tensor_map = find_vision_tensors(qwen2vl, np_dtype)
if args.model_type == "qwen2.5vl":
fout.add_uint32("clip.vision.n_wa_pattern", get_n_wa_pattern(vcfg.fullatt_block_indexes))
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.hidden_size)
fout.add_uint32("clip.vision.projection_dim", vcfg.out_hidden_size)
fout.add_string("clip.projector_type", "qwen2.5vl_merger")
else:
fout.add_string("clip.projector_type", "qwen2vl_merger")
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
if args.model_type == "qwen2.5vl":
tensor_map = VL25.find_vision_tensors(qwen2vl, np_dtype)
else:
tensor_map = VL2.find_vision_tensors(qwen2vl, np_dtype)
for name, data in tensor_map.items():
fout.add_tensor(name, data)
fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2)
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
@ -160,6 +211,7 @@ def main(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
parser.add_argument("--model_type", nargs='?', choices=['qwen2vl', 'qwen2.5vl'], default="qwen2vl")
parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
args = parser.parse_args()
main(args)

View File

@ -23,7 +23,12 @@
#include <algorithm>
#include <iostream>
#include <fstream>
#include <limits>
#include <cassert>
#include <cmath>
// THIS FILE IS ONLY USED FOR TESTING THE QWEN2VL MODEL
// IT IS NOT PRODUCTION CODE
static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
@ -89,20 +94,12 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
int N = (int) tokens.size();
std::vector<llama_pos> pos;
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
auto batch = llama_batch_get_one(&tokens[i], n_eval);
// TODO: add mrope pos ids somewhere else
pos.resize(batch.n_tokens * 4);
std::fill(pos.begin(), pos.end(), 0);
for (int j = 0; j < batch.n_tokens * 3; j ++) {
pos[j] = *st_pos_id + (j % batch.n_tokens);
}
batch.pos = pos.data();
if (llama_decode(ctx_llama, batch)) {
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
@ -314,7 +311,7 @@ static struct llama_model * llava_init(common_params * params) {
llama_model_params model_params = common_model_params_to_llama(*params);
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
@ -323,14 +320,14 @@ static struct llama_model * llava_init(common_params * params) {
}
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
const char * clip_path = params->mmproj.c_str();
const char * clip_path = params->mmproj.path.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
auto ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
llama_context_params ctx_params = common_context_params_to_llama(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@ -367,14 +364,14 @@ static void debug_test_mrope_2d() {
// 1. Initialize backend
ggml_backend_t backend = NULL;
std::string backend_name = "";
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
backend = ggml_backend_cuda_init(0); // init device 0
backend_name = "cuda";
if (!backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
#endif
// #ifdef GGML_USE_CUDA
// fprintf(stderr, "%s: using CUDA backend\n", __func__);
// backend = ggml_backend_cuda_init(0); // init device 0
// backend_name = "cuda";
// if (!backend) {
// fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
// }
// #endif
// if there aren't GPU Backends fallback to CPU backend
if (!backend) {
backend = ggml_backend_cpu_init();
@ -483,28 +480,82 @@ static void debug_test_mrope_2d() {
ggml_backend_free(backend);
}
static void debug_dump_img_embed(struct llava_context * ctx_llava) {
int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
int ne = n_embd * 4;
float vals[56 * 56 * 3];
enum model_output_type {
conv3d,
patch_embed,
patch_win_attn_scatter,
first_attn_layer,
last_attn_layer,
attn_softmax,
final_layer,
};
static void debug_dump_img_embed(struct llava_context * ctx_llava, model_output_type output_type) {
constexpr int ih = 140;
constexpr int iw = 196;
// constexpr int ih = 56;
// constexpr int iw = 56;
// int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
int n_embd = 1280;
int merge = 1;
if (output_type == model_output_type::final_layer) {
n_embd = 2048;
merge = 2;
}
else if (output_type == model_output_type::attn_softmax) {
merge = 1;
n_embd = (ih/14/merge) * (iw/14/merge) * 16;
}
int ne = (ih/14/merge) * (iw/14/merge) * n_embd;
float vals[iw * ih * 3];
// float embd[ne];
std::vector<float> embd;
embd.resize(ne);
for (int i = 0; i < 56*56; i++)
for (int i = 0; i < iw*ih; i++)
{
for (int c = 0; c < 3; c++)
vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56);
vals[i * 3 + c] = (float)i / (iw*ih);
}
clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data());
clip_encode_float_image(ctx_llava->ctx_clip, 8, vals, ih, iw, embd.data());
std::ofstream outFile("img_embed.bin", std::ios::binary);
std::string file_postfix = "";
switch (output_type)
{
case model_output_type::conv3d:
file_postfix = "conv3d";
break;
case model_output_type::patch_embed:
file_postfix = "patch_embed";
break;
case model_output_type::patch_win_attn_scatter:
file_postfix = "scatter";
break;
case model_output_type::first_attn_layer:
file_postfix = "first_attn";
break;
case model_output_type::last_attn_layer:
file_postfix = "last_attn";
break;
case model_output_type::attn_softmax:
file_postfix = "attn_softmax";
break;
case model_output_type::final_layer:
file_postfix = "final";
break;
default:
break;
}
auto output_path = "img_embed_" + file_postfix + ".bin";
std::ofstream outFile(output_path, std::ios::binary);
if (outFile.is_open()) {
outFile.write(reinterpret_cast<const char*>(embd.data()), ne * sizeof(float));
outFile.close();
std::cout << "Data successfully written to mrope.bin" << std::endl;
std::cout << "Data successfully written to ::[ " << output_path << std::endl;
} else {
std::cerr << "Error opening file!" << std::endl;
}
@ -524,7 +575,7 @@ int main(int argc, char ** argv) {
common_init();
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv);
return 1;
}
@ -551,8 +602,9 @@ int main(int argc, char ** argv) {
} else if (params.image[0].empty()) {
auto ctx_llava = llava_init_context(&params, model);
debug_test_mrope_2d();
debug_dump_img_embed(ctx_llava);
// debug_test_mrope_2d();
debug_dump_img_embed(ctx_llava, model_output_type::final_layer);
// debug_dump_img_embed(ctx_llava, model_output_type::last_attn_layer);
llama_perf_context_print(ctx_llava->ctx_llama);
ctx_llava->model = NULL;

BIN
examples/llava/test-1.jpeg Normal file

Binary file not shown (new image, 121 KiB)

Some files were not shown because too many files have changed in this diff.