Mirror of https://github.com/ggml-org/llama.cpp.git — synced 2025-06-27 12:05:03 +00:00
Merge branch 'master' into compilade/mamba2

@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04
 
 ## Build Image
 

@@ -1,10 +1,10 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc3.1.1
+ARG MUSA_VERSION=rc4.0.1
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
 
-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
 
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 
@@ -21,21 +21,14 @@ RUN apt-get update && \
     libcurl4-openssl-dev \
     libgomp1
 
-COPY requirements.txt requirements.txt
-COPY requirements requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
 WORKDIR /app
 
 COPY . .
 
-# Use the default MUSA archs if not specified
 RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
         export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
     fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release -j$(nproc)
 
 RUN mkdir -p /app/lib && \

@@ -21,15 +21,15 @@ indent_style = tab
 [prompts/*.txt]
 insert_final_newline = unset
 
-[examples/server/public/*]
+[tools/server/public/*]
 indent_size = 2
 
-[examples/server/public/deps_*]
+[tools/server/public/deps_*]
 trim_trailing_whitespace = unset
 indent_style = unset
 indent_size = unset
 
-[examples/server/deps_*]
+[tools/server/deps_*]
 trim_trailing_whitespace = unset
 indent_style = unset
 indent_size = unset
@@ -37,7 +37,7 @@ indent_size = unset
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
 
-[examples/cvector-generator/*.txt]
+[tools/cvector-generator/*.txt]
 trim_trailing_whitespace = unset
 insert_final_newline = unset
 
@@ -48,3 +48,7 @@ end_of_line = unset
 charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset
+
+[vendor/miniaudio/miniaudio.h]
+trim_trailing_whitespace = unset
+insert_final_newline = unset

.flake8 (3 changed lines)
@@ -2,8 +2,9 @@
 max-line-length = 125
 ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
 exclude =
-    # Do not traverse examples
+    # Do not traverse examples and tools
     examples,
+    tools,
     # Do not include package initializers
     __init__.py,
     # No need to traverse our git directory

.github/actions/get-tag-name/action.yml (new file, 22 lines)
name: "Determine tag name"
description: "Determine the tag name to use for a release"
outputs:
  name:
    description: "The name of the tag"
    value: ${{ steps.tag.outputs.name }}

runs:
  using: "composite"
  steps:
    - name: Determine tag name
      id: tag
      shell: bash
      run: |
        BUILD_NUMBER="$(git rev-list --count HEAD)"
        SHORT_HASH="$(git rev-parse --short=7 HEAD)"
        if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
          echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
        else
          SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
          echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
        fi

.github/actions/windows-setup-cuda/action.yml (new file, 67 lines)
name: "Windows - Setup CUDA Toolkit"
description: "Setup CUDA Toolkit for Windows"
inputs:
  cuda_version:
    description: "CUDA toolkit version"
    required: true

runs:
  using: "composite"
  steps:
    - name: Install Cuda Toolkit 11.7
      if: ${{ inputs.cuda_version == '11.7' }}
      shell: pwsh
      run: |
        mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
        choco install unzip -y
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
        unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
        echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
        echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
        echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
        echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8

    - name: Install Cuda Toolkit 12.4
      if: ${{ inputs.cuda_version == '12.4' }}
      shell: pwsh
      run: |
        mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
        choco install unzip -y
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
        unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
        echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
        echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
        echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
        echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8

@@ -5,6 +5,10 @@ inputs:
     description: 'CURL version'
     required: false
     default: '8.6.0_6'
+  architecture:
+    description: 'Architecture of the libcurl to download'
+    required: false
+    default: 'win64'
 outputs:
   curl_path:
     description: "Path to the downloaded libcurl"
@@ -18,8 +22,9 @@ runs:
       shell: powershell
       env:
         CURL_VERSION: ${{ inputs.curl_version }}
+        ARCHITECTURE: ${{ inputs.architecture }}
       run: |
-        curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
+        curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip"
         mkdir $env:RUNNER_TEMP/libcurl
         tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
         echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT

.github/labeler.yml (13 changed lines)
@@ -45,7 +45,9 @@ build:
     - CMakePresets.json
 examples:
   - changed-files:
-      - any-glob-to-any-file: examples/**
+      - any-glob-to-any-file:
+          - examples/**
+          - tools/**
 devops:
   - changed-files:
       - any-glob-to-any-file:
@@ -70,7 +72,7 @@ android:
 server:
   - changed-files:
       - any-glob-to-any-file:
-          - examples/server/**
+          - tools/server/**
 ggml:
   - changed-files:
       - any-glob-to-any-file:
@@ -84,3 +86,10 @@ nix:
 embedding:
   - changed-files:
       - any-glob-to-any-file: examples/embedding/
+
+Ascend NPU:
+  - changed-files:
+      - any-glob-to-any-file:
+          - ggml/include/ggml-cann.h
+          - ggml/src/ggml-cann/**
+          - docs/backend/CANN.md

.github/workflows/bench.yml.disabled (30 changed lines)
@@ -27,10 +27,10 @@ on:
   push:
     branches:
       - master
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
   pull_request_target:
     types: [opened, synchronize, reopened]
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
   schedule:
     - cron: '04 2 * * *'
 
@@ -69,7 +69,7 @@ jobs:
       - name: Install python env
        id: pipenv
        run: |
-          cd examples/server/bench
+          cd tools/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt
@@ -79,7 +79,7 @@ jobs:
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
-          ./prometheus --config.file=examples/server/bench/prometheus.yml &
+          ./prometheus --config.file=tools/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done
@@ -92,7 +92,7 @@ jobs:
      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
-          cd examples/server/bench
+          cd tools/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
            --with github.com/phymbert/xk6-sse
@@ -116,7 +116,7 @@ jobs:
      - name: Download the dataset
        id: download_dataset
        run: |
-          cd examples/server/bench
+          cd tools/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 
      - name: Server bench
@@ -126,7 +126,7 @@ jobs:
        run: |
          set -eux
 
-          cd examples/server/bench
+          cd tools/server/bench
          source venv/bin/activate
          python bench.py \
            --runner-label ${{ env.RUNNER_LABEL }} \
@@ -157,9 +157,9 @@ jobs:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
-            examples/server/bench/*.jpg
-            examples/server/bench/*.json
-            examples/server/bench/*.log
+            tools/server/bench/*.jpg
+            tools/server/bench/*.json
+            tools/server/bench/*.log
 
      - name: Commit status
        uses: Sibz/github-status-action@v1
@@ -178,17 +178,17 @@ jobs:
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
-            examples/server/bench/prompt_tokens_seconds.jpg
-            examples/server/bench/predicted_tokens_seconds.jpg
-            examples/server/bench/kv_cache_usage_ratio.jpg
-            examples/server/bench/requests_processing.jpg
+            tools/server/bench/prompt_tokens_seconds.jpg
+            tools/server/bench/predicted_tokens_seconds.jpg
+            tools/server/bench/kv_cache_usage_ratio.jpg
+            tools/server/bench/requests_processing.jpg
 
      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux
 
-          cd examples/server/bench
+          cd tools/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV

.github/workflows/build-linux-cross.yml (225 changed lines)
@@ -26,14 +26,15 @@ jobs:
          sudo apt-get install -y --no-install-recommends \
            build-essential \
            gcc-14-riscv64-linux-gnu \
-            g++-14-riscv64-linux-gnu \
-            libcurl4-openssl-dev:riscv64
+            g++-14-riscv64-linux-gnu
 
      - name: Build
        run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+                -DCMAKE_BUILD_TYPE=Release \
                -DGGML_OPENMP=OFF \
                -DLLAMA_BUILD_EXAMPLES=ON \
+                -DLLAMA_BUILD_TOOLS=ON \
                -DLLAMA_BUILD_TESTS=OFF \
                -DCMAKE_SYSTEM_NAME=Linux \
                -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
@@ -71,15 +72,16 @@ jobs:
            glslc \
            gcc-14-riscv64-linux-gnu \
            g++-14-riscv64-linux-gnu \
-            libvulkan-dev:riscv64 \
-            libcurl4-openssl-dev:riscv64
+            libvulkan-dev:riscv64
 
      - name: Build
        run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+                -DCMAKE_BUILD_TYPE=Release \
                -DGGML_VULKAN=ON \
                -DGGML_OPENMP=OFF \
                -DLLAMA_BUILD_EXAMPLES=ON \
+                -DLLAMA_BUILD_TOOLS=ON \
                -DLLAMA_BUILD_TESTS=OFF \
                -DCMAKE_SYSTEM_NAME=Linux \
                -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
@@ -116,15 +118,16 @@ jobs:
            build-essential \
            glslc \
            crossbuild-essential-arm64 \
-            libvulkan-dev:arm64 \
-            libcurl4-openssl-dev:arm64
+            libvulkan-dev:arm64
 
      - name: Build
        run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+                -DCMAKE_BUILD_TYPE=Release \
                -DGGML_VULKAN=ON \
                -DGGML_OPENMP=OFF \
                -DLLAMA_BUILD_EXAMPLES=ON \
+                -DLLAMA_BUILD_TOOLS=ON \
                -DLLAMA_BUILD_TESTS=OFF \
                -DCMAKE_SYSTEM_NAME=Linux \
                -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
@@ -137,3 +140,207 @@ jobs:
                -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
 
          cmake --build build --config Release -j $(nproc)
+
+  ubuntu-24-ppc64el-cpu-cross:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup PowerPC64le
+        run: |
+          sudo dpkg --add-architecture ppc64el
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+          EOF
+
+          sudo apt-get update || true ;# Prevent failure due to missing URLs.
+
+          sudo apt-get install -y --no-install-recommends \
+            build-essential \
+            gcc-14-powerpc64le-linux-gnu \
+            g++-14-powerpc64le-linux-gnu
+
+      - name: Build
+        run: |
+          cmake -B build -DLLAMA_CURL=OFF \
+                -DCMAKE_BUILD_TYPE=Release \
+                -DGGML_OPENMP=OFF \
+                -DLLAMA_BUILD_EXAMPLES=ON \
+                -DLLAMA_BUILD_TOOLS=ON \
+                -DLLAMA_BUILD_TESTS=OFF \
+                -DCMAKE_SYSTEM_NAME=Linux \
+                -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
+                -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
+                -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
+                -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
+                -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
+
+  ubuntu-24-ppc64el-vulkan-cross:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup PowerPC64le
+        run: |
+          sudo dpkg --add-architecture ppc64el
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+          EOF
+
+          sudo apt-get update || true ;# Prevent failure due to missing URLs.
+
+          sudo apt-get install -y --no-install-recommends \
+            build-essential \
+            glslc \
+            gcc-14-powerpc64le-linux-gnu \
+            g++-14-powerpc64le-linux-gnu \
+            libvulkan-dev:ppc64el
+
+      - name: Build
+        run: |
+          cmake -B build -DLLAMA_CURL=OFF \
+                -DCMAKE_BUILD_TYPE=Release \
+                -DGGML_VULKAN=ON \
+                -DGGML_OPENMP=OFF \
+                -DLLAMA_BUILD_EXAMPLES=ON \
+                -DLLAMA_BUILD_TOOLS=ON \
+                -DLLAMA_BUILD_TESTS=OFF \
+                -DCMAKE_SYSTEM_NAME=Linux \
+                -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
+                -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
+                -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
+                -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
+                -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
+
+  debian-13-loongarch64-cpu-cross:
+    runs-on: ubuntu-24.04
+    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup LoongArch
+        run: |
+          rm -f /etc/apt/sources.list.d/*
+          cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
+          deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
+          EOF
+          ( echo 'quiet "true";'; \
+            echo 'APT::Get::Assume-Yes "true";'; \
+            echo 'APT::Install-Recommends "false";'; \
+            echo 'Acquire::Check-Valid-Until "false";'; \
+            echo 'Acquire::Retries "5";'; \
+          ) > /etc/apt/apt.conf.d/99snapshot-repos
+
+          apt-get update
+          apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
+          dpkg --add-architecture loong64
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
+          deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
+          EOF
+
+          apt-get update || true ;# Prevent failure due to missing URLs.
+
+          apt-get install -y --no-install-recommends \
+            build-essential \
+            gcc-14-loongarch64-linux-gnu \
+            g++-14-loongarch64-linux-gnu
+
+      - name: Build
+        run: |
+          cmake -B build -DLLAMA_CURL=OFF \
+                -DCMAKE_BUILD_TYPE=Release \
+                -DGGML_OPENMP=OFF \
+                -DLLAMA_BUILD_EXAMPLES=ON \
+                -DLLAMA_BUILD_TOOLS=ON \
+                -DLLAMA_BUILD_TESTS=OFF \
+                -DCMAKE_SYSTEM_NAME=Linux \
+                -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
+                -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
+                -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
+                -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
+                -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
+
+  debian-13-loongarch64-vulkan-cross:
+    runs-on: ubuntu-24.04
+    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup LoongArch
+        run: |
+          rm -f /etc/apt/sources.list.d/*
+          cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
+          deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
+          EOF
+          ( echo 'quiet "true";'; \
+            echo 'APT::Get::Assume-Yes "true";'; \
+            echo 'APT::Install-Recommends "false";'; \
+            echo 'Acquire::Check-Valid-Until "false";'; \
+            echo 'Acquire::Retries "5";'; \
+          ) > /etc/apt/apt.conf.d/99snapshot-repos
+
+          apt-get update
+          apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
+          dpkg --add-architecture loong64
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
+          deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
+          EOF
+
+          apt-get update || true ;# Prevent failure due to missing URLs.
+
+          apt-get install -y --no-install-recommends \
+            build-essential \
+            glslc \
+            gcc-14-loongarch64-linux-gnu \
+            g++-14-loongarch64-linux-gnu \
+            libvulkan-dev:loong64
+
+      - name: Build
+        run: |
+          cmake -B build -DLLAMA_CURL=OFF \
+                -DCMAKE_BUILD_TYPE=Release \
+                -DGGML_VULKAN=ON \
+                -DGGML_OPENMP=OFF \
+                -DLLAMA_BUILD_EXAMPLES=ON \
+                -DLLAMA_BUILD_TOOLS=ON \
+                -DLLAMA_BUILD_TESTS=OFF \
+                -DCMAKE_SYSTEM_NAME=Linux \
+                -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
+                -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
+                -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
+                -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
+                -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)

.github/workflows/build.yml (811 changed lines) — diff suppressed because it is too large

.github/workflows/docker.yml (7 changed lines)
@@ -36,10 +36,13 @@ jobs:
      matrix:
        config:
          # Multi-stage build
-          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
+          # Note: the arm64 images are failing, which prevents the amd64 images from being built
+          # https://github.com/ggml-org/llama.cpp/issues/11888
+          #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
-          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }

.github/workflows/release.yml (new file, 749 lines)
name: Release

on:
  workflow_dispatch: # allows manual triggering
    inputs:
      create_release:
        description: 'Create new release'
        required: true
        type: boolean
  push:
    branches:
      - master
    paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"

jobs:
  macOS-arm64:
    runs-on: macos-14

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-arm64
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        continue-on-error: true
        run: |
          brew update
          brew install curl

      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          cmake -B build \
                -DCMAKE_BUILD_RPATH="@loader_path" \
                -DLLAMA_FATAL_WARNINGS=ON \
                -DGGML_METAL_USE_BF16=ON \
                -DGGML_METAL_EMBED_LIBRARY=ON \
                -DGGML_RPC=ON \
                ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Determine tag name
        id: tag
        uses: ./.github/actions/get-tag-name

      - name: Pack artifacts
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
          name: llama-bin-macos-arm64.zip

  macOS-x64:
    runs-on: macos-13

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-x64
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        continue-on-error: true
        run: |
          brew update
          brew install curl

      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
          cmake -B build \
                -DCMAKE_BUILD_RPATH="@loader_path" \
                -DLLAMA_FATAL_WARNINGS=ON \
                -DGGML_METAL=OFF \
                -DGGML_RPC=ON
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Determine tag name
        id: tag
        uses: ./.github/actions/get-tag-name

      - name: Pack artifacts
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
          name: llama-bin-macos-x64.zip

  ubuntu-22-cpu:
    strategy:
      matrix:
        include:
          - build: 'x64'
            os: ubuntu-22.04
          # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
          # - build: 'arm64'
          #   os: ubuntu-22.04-arm

    runs-on: ${{ matrix.os }}

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-cpu-cmake
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Build
        id: cmake_build
        run: |
          cmake -B build \
                -DGGML_BACKEND_DL=ON \
                -DGGML_NATIVE=OFF \
                -DGGML_CPU_ALL_VARIANTS=ON \
                -DLLAMA_FATAL_WARNINGS=ON \
                ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

      - name: Determine tag name
        id: tag
        uses: ./.github/actions/get-tag-name

      - name: Pack artifacts
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
          name: llama-bin-ubuntu-${{ matrix.build }}.zip

  ubuntu-22-vulkan:
    runs-on: ubuntu-22.04

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-vulkan
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
          sudo apt-get update -y
          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev

      - name: Build
        id: cmake_build
        run: |
          cmake -B build \
                -DGGML_BACKEND_DL=ON \
                -DGGML_NATIVE=OFF \
                -DGGML_CPU_ALL_VARIANTS=ON \
                -DGGML_VULKAN=ON \
                ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

      - name: Determine tag name
        id: tag
        uses: ./.github/actions/get-tag-name

      - name: Pack artifacts
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
          name: llama-bin-ubuntu-vulkan-x64.zip

  windows-cpu:
    runs-on: windows-latest

    strategy:
      matrix:
        include:
          - arch: 'x64'
          - arch: 'arm64'

    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-cpu-${{ matrix.arch }}
          variant: ccache
          evict-old-files: 1d

      - name: Install Ninja
        run: |
          choco install ninja

      - name: libCURL
        id: get_libcurl
        uses: ./.github/actions/windows-setup-curl
        with:
          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}

      - name: Build
        shell: cmd
        env:
          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }}
          cmake -S . -B build -G "Ninja Multi-Config" ^
            -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
            -DGGML_NATIVE=OFF ^
            -DGGML_BACKEND_DL=ON ^
            -DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
            -DGGML_OPENMP=ON ^
            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release

      - name: Pack artifacts
        id: pack_artifacts
        env:
          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
          7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-bin-win-cpu-${{ matrix.arch }}.zip
          name: llama-bin-win-cpu-${{ matrix.arch }}.zip

  windows:
    runs-on: windows-latest

    env:
      OPENBLAS_VERSION: 0.3.23
      VULKAN_VERSION: 1.4.309.0

    strategy:
      matrix:
        include:
          - backend: 'vulkan'
            arch: 'x64'
            defines: '-DGGML_VULKAN=ON'
            target: 'ggml-vulkan'
          - backend: 'opencl-adreno'
            arch: 'arm64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
            target: 'ggml-opencl'

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
          variant: ccache
          evict-old-files: 1d

      - name: Install Vulkan SDK
        id: get_vulkan
        if: ${{ matrix.backend == 'vulkan' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"

      - name: Install Ninja
        id: install_ninja
        run: |
          choco install ninja

      - name: Install OpenCL Headers and Libs
        id: install_opencl
        if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }}
        run: |
          git clone https://github.com/KhronosGroup/OpenCL-Headers
          cd OpenCL-Headers
          cmake -B build `
            -DBUILD_TESTING=OFF `
            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
          cmake --build build --target install
          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
          cd OpenCL-ICD-Loader
          cmake -B build-arm64-release `
            -A arm64 `
            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
          cmake --build build-arm64-release --target install --config release

      - name: Build
        id: cmake_build
        run: |
          cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF
          cmake --build build --config Release --target ${{ matrix.target }}

      - name: Pack artifacts
        id: pack_artifacts
        run: |
          7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
          name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip

  windows-cuda:
    runs-on: windows-2022

    strategy:
      matrix:
        cuda: ['12.4']

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Install ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-cuda-${{ matrix.cuda }}
          variant: ccache
          evict-old-files: 1d

      - name: Install Cuda Toolkit
        uses: ./.github/actions/windows-setup-cuda
        with:
          cuda_version: ${{ matrix.cuda }}

      - name: Install Ninja
        id: install_ninja
        run: |
          choco install ninja

      - name: Build
        id: cmake_build
        shell: cmd
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
          cmake -S . -B build -G "Ninja Multi-Config" ^
            -DGGML_BACKEND_DL=ON ^
            -DGGML_NATIVE=OFF ^
            -DGGML_CPU=OFF ^
            -DGGML_CUDA=ON ^
            -DLLAMA_CURL=OFF
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
          cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda

      - name: Pack artifacts
        id: pack_artifacts
        run: |
          7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
          name: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

      - name: Copy and pack Cuda runtime
        run: |
          echo "Cuda install location: ${{ env.CUDA_PATH }}"
          $dst='.\build\bin\cudart\'
          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
          7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*

      - name: Upload Cuda runtime
        uses: actions/upload-artifact@v4
        with:
          path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

  windows-sycl:
    runs-on: windows-latest

    defaults:
      run:
        shell: bash

    env:
      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-sycl
          variant: ccache
          evict-old-files: 1d

      - name: Install
        run: |
          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL

      - name: Build
        id: cmake_build
        shell: cmd
        run: |
          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
          cmake -G "Ninja" -B build ^
            -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
            -DCMAKE_BUILD_TYPE=Release ^
            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
            -DLLAMA_CURL=OFF
          cmake --build build --target ggml-sycl -j

      - name: Build the release package
        id: pack_artifacts
        run: |
          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"

          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin

          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin

          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin

          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin

          echo "cp oneAPI running time dll files to ./build/bin done"
          7z a llama-bin-win-sycl-x64.zip ./build/bin/*

      - name: Upload the release package
        uses: actions/upload-artifact@v4
        with:
          path: llama-bin-win-sycl-x64.zip
          name: llama-bin-win-sycl-x64.zip

  windows-hip:
    runs-on: windows-latest

    strategy:
      matrix:
        include:
          - name: "radeon"
            gpu_targets: "gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Clone rocWMMA repository
        id: clone_rocwmma
        run: |
          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1

      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-hip-${{ matrix.name }}-x64
          evict-old-files: 1d

      - name: Install
        id: depends
        run: |
          $ErrorActionPreference = "Stop"
          write-host "Downloading AMD HIP SDK Installer"
          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
          write-host "Completed AMD HIP SDK installation"

      - name: Verify ROCm
        id: verify
        run: |
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version

      - name: Build
        id: cmake_build
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
          cmake -G "Unix Makefiles" -B build -S . `
            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
            -DCMAKE_BUILD_TYPE=Release `
            -DGGML_BACKEND_DL=ON `
            -DGGML_NATIVE=OFF `
            -DGGML_CPU=OFF `
            -DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
            -DGGML_HIP_ROCWMMA_FATTN=ON `
            -DGGML_HIP=ON `
            -DLLAMA_CURL=OFF
          cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
          md "build\bin\rocblas\library\"
          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"

      - name: Pack artifacts
        id: pack_artifacts
        run: |
          7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-bin-win-hip-${{ matrix.name }}-x64.zip
          name: llama-bin-win-hip-${{ matrix.name }}-x64.zip

  ios-xcode-build:
    runs-on: macos-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          cmake -B build -G Xcode \
                -DGGML_METAL_USE_BF16=ON \
                -DGGML_METAL_EMBED_LIBRARY=ON \
                -DLLAMA_CURL=OFF \
                -DLLAMA_BUILD_EXAMPLES=OFF \
                -DLLAMA_BUILD_TOOLS=OFF \
                -DLLAMA_BUILD_TESTS=OFF \
                -DLLAMA_BUILD_SERVER=OFF \
                -DCMAKE_SYSTEM_NAME=iOS \
                -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
                -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

      - name: xcodebuild for swift package
        id: xcodebuild
        run: |
          ./build-xcframework.sh

      - name: Build Xcode project
        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||||
|
|
||||||
|
- name: Determine tag name
|
||||||
|
id: tag
|
||||||
|
uses: ./.github/actions/get-tag-name
|
||||||
|
|
||||||
|
- name: Pack artifacts
|
||||||
|
id: pack_artifacts
|
||||||
|
run: |
|
||||||
|
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
|
||||||
|
|
||||||
|
- name: Upload artifacts
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||||
|
name: llama-${{ steps.tag.outputs.name }}-xcframework
|
||||||
|
|
||||||
|
release:
|
||||||
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||||
|
|
||||||
|
# Fine-grant permission
|
||||||
|
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
|
||||||
|
permissions:
|
||||||
|
contents: write # for creating release
|
||||||
|
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
needs:
|
||||||
|
- windows
|
||||||
|
- windows-cpu
|
||||||
|
- windows-cuda
|
||||||
|
- windows-sycl
|
||||||
|
- windows-hip
|
||||||
|
- ubuntu-22-cpu
|
||||||
|
- ubuntu-22-vulkan
|
||||||
|
- macOS-arm64
|
||||||
|
- macOS-x64
|
||||||
|
- ios-xcode-build
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: Determine tag name
|
||||||
|
id: tag
|
||||||
|
uses: ./.github/actions/get-tag-name
|
||||||
|
|
||||||
|
- name: Download artifacts
|
||||||
|
id: download-artifact
|
||||||
|
uses: actions/download-artifact@v4
|
||||||
|
with:
|
||||||
|
path: ./artifact
|
||||||
|
merge-multiple: true
|
||||||
|
|
||||||
|
- name: Move artifacts
|
||||||
|
id: move_artifacts
|
||||||
|
run: |
|
||||||
|
mkdir -p release
|
||||||
|
|
||||||
|
echo "Adding CPU backend files to existing zips..."
|
||||||
|
for arch in x64 arm64; do
|
||||||
|
cpu_zip="artifact/llama-bin-win-cpu-${arch}.zip"
|
||||||
|
temp_dir=$(mktemp -d)
|
||||||
|
echo "Extracting CPU backend for $arch..."
|
||||||
|
unzip "$cpu_zip" -d "$temp_dir"
|
||||||
|
|
||||||
|
echo "Adding CPU files to $arch zips..."
|
||||||
|
for target_zip in artifact/llama-bin-win-*-${arch}.zip; do
|
||||||
|
if [[ "$target_zip" == "$cpu_zip" ]]; then
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
echo "Adding CPU backend to $(basename "$target_zip")"
|
||||||
|
realpath_target_zip=$(realpath "$target_zip")
|
||||||
|
(cd "$temp_dir" && zip -r "$realpath_target_zip" .)
|
||||||
|
done
|
||||||
|
|
||||||
|
rm -rf "$temp_dir"
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Renaming and moving zips to release..."
|
||||||
|
for zip_file in artifact/llama-bin-win-*.zip; do
|
||||||
|
base_name=$(basename "$zip_file" .zip)
|
||||||
|
zip_name="llama-${{ steps.tag.outputs.name }}-${base_name#llama-}.zip"
|
||||||
|
echo "Moving $zip_file to release/$zip_name"
|
||||||
|
mv "$zip_file" "release/$zip_name"
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Moving other artifacts..."
|
||||||
|
mv -v artifact/*.zip release
|
||||||
|
|
||||||
|
- name: Create release
|
||||||
|
id: create_release
|
||||||
|
uses: ggml-org/action-create-release@v1
|
||||||
|
env:
|
||||||
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
with:
|
||||||
|
tag_name: ${{ steps.tag.outputs.name }}
|
||||||
|
|
||||||
|
- name: Upload release
|
||||||
|
id: upload_release
|
||||||
|
uses: actions/github-script@v3
|
||||||
|
with:
|
||||||
|
github-token: ${{secrets.GITHUB_TOKEN}}
|
||||||
|
script: |
|
||||||
|
const path = require('path');
|
||||||
|
const fs = require('fs');
|
||||||
|
const release_id = '${{ steps.create_release.outputs.id }}';
|
||||||
|
for (let file of await fs.readdirSync('./release')) {
|
||||||
|
if (path.extname(file) === '.zip') {
|
||||||
|
console.log('uploadReleaseAsset', file);
|
||||||
|
await github.repos.uploadReleaseAsset({
|
||||||
|
owner: context.repo.owner,
|
||||||
|
repo: context.repo.repo,
|
||||||
|
release_id: release_id,
|
||||||
|
name: file,
|
||||||
|
data: await fs.readFileSync(`./release/${file}`)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
.github/workflows/server.yml (vendored, 26 changes)

@@ -15,10 +15,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']

 env:
   LLAMA_LOG_COLORS: 1

@@ -74,7 +74,7 @@ jobs:
       - name: Tests dependencies
        id: test_dependencies
        run: |
-          pip install -r examples/server/tests/requirements.txt
+          pip install -r tools/server/tests/requirements.txt

       # Setup nodejs (to be used for verifying bundled index.html)
       - uses: actions/setup-node@v4

@@ -84,14 +84,14 @@ jobs:
       - name: WebUI - Install dependencies
        id: webui_lint
        run: |
-          cd examples/server/webui
+          cd tools/server/webui
          npm ci

       - name: WebUI - Check code format
        id: webui_format
        run: |
          git config --global --add safe.directory $(realpath .)
-          cd examples/server/webui
+          cd tools/server/webui
          git status

          npm run format

@@ -108,7 +108,7 @@ jobs:
        id: verify_server_index_html
        run: |
          git config --global --add safe.directory $(realpath .)
-          cd examples/server/webui
+          cd tools/server/webui
          git status

          npm run build

@@ -161,26 +161,26 @@ jobs:
        env:
          GITHUB_ACTIONS: "true"
        run: |
-          cd examples/server/tests
+          cd tools/server/tests
          ./tests.sh

       - name: Tests (sanitizers)
        id: server_integration_tests_sanitizers
        if: ${{ matrix.sanitizer != '' }}
        run: |
-          cd examples/server/tests
+          cd tools/server/tests
          LLAMA_SANITIZE=1 ./tests.sh

       - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
-          cd examples/server/tests
+          cd tools/server/tests
          SLOW_TESTS=1 ./tests.sh


   server-windows:
-    runs-on: windows-2019
+    runs-on: windows-2022

     steps:
       - name: Clone

@@ -211,7 +211,7 @@ jobs:
       - name: Tests dependencies
        id: test_dependencies
        run: |
-          pip install -r examples/server/tests/requirements.txt
+          pip install -r tools/server/tests/requirements.txt

       - name: Copy Libcurl
        id: prepare_libcurl

@@ -224,7 +224,7 @@ jobs:
        id: server_integration_tests
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
-          cd examples/server/tests
+          cd tools/server/tests
          $env:PYTHONIOENCODING = ":replace"
          pytest -v -x -m "not slow"

@@ -232,6 +232,6 @@ jobs:
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
-          cd examples/server/tests
+          cd tools/server/tests
          $env:SLOW_TESTS = "1"
          pytest -v -x
.github/workflows/winget.yml (vendored, new file, 42 lines)

@@ -0,0 +1,42 @@
+name: Update Winget Package
+
+on:
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    - cron: '28 5 * * *' # Update every day at 5:28 UTC
+
+jobs:
+  update:
+    name: Update Winget Package
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Install cargo binstall
+        uses: cargo-bins/cargo-binstall@268643a6b5ea099f5718ee5cd3ff7dc89a5eb49b
+
+      - name: Install komac
+        run: |
+          cargo binstall komac@2.11.2 -y
+
+      - name: Find latest release
+        id: find_latest_release
+        uses: actions/github-script@v6
+        with:
+          script: |
+            const { data: releases } = await github.rest.repos.listReleases({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+            });
+            console.log("Latest release:", releases[0].tag_name);
+            return releases[0].tag_name;
+
+      - name: Update manifest
+        env:
+          VERSION: ${{ steps.find_latest_release.outputs.result }}
+        run: |
+          echo "Updating manifest..."
+          komac update --version ${{ env.VERSION }} \
+            --urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \
+            --token ${{ secrets.WINGET_GITHUB_TOKEN }} \
+            --submit \
+            ggml.llamacpp
.gitignore (vendored, 12 changes)

@@ -96,11 +96,11 @@ perf-*.txt
 # Examples

 examples/jeopardy/results.txt
-examples/server/*.css.hpp
+tools/server/*.css.hpp
-examples/server/*.html.hpp
+tools/server/*.html.hpp
-examples/server/*.js.hpp
+tools/server/*.js.hpp
-examples/server/*.mjs.hpp
+tools/server/*.mjs.hpp
-examples/server/*.gz.hpp
+tools/server/*.gz.hpp
 !build_64.sh
 !examples/*.bat
 !examples/*/*.kts

@@ -110,7 +110,7 @@ examples/server/*.gz.hpp

 # Server Web UI temporary files
 node_modules
-examples/server/webui/dist
+tools/server/webui/dist

 # Python
CMakeLists.txt

@@ -77,6 +77,7 @@ option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})

 # extra artifacts
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})

@@ -158,6 +159,11 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
     # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()
+
+if (MINGW)
+    # Target Windows 8 for PrefetchVirtualMemory
+    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
+endif()

 #
 # build the library
 #

@@ -187,6 +193,10 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
     add_subdirectory(pocs)
 endif()
+
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
+    add_subdirectory(tools)
+endif()

 #
 # install
 #

@@ -247,20 +257,3 @@ configure_file(cmake/llama.pc.in

 install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
-
-#
-# copy the license files
-#
-
-# Check if running in GitHub Actions
-if(DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
-    message(STATUS "Running inside GitHub Actions - copying license files")
-
-    # Copy all files from licenses/ to build/bin/
-    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
-    foreach(LICENSE_FILE ${LICENSE_FILES})
-        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
-        configure_file(${LICENSE_FILE} "${CMAKE_BINARY_DIR}/bin/${FILENAME}" COPYONLY)
-    endforeach()
-endif()
-
CMakePresets.json

@@ -38,15 +38,6 @@
         }
     },
-
-    {
-        "name": "arm64-windows-msvc", "hidden": true,
-        "architecture": { "value": "arm64", "strategy": "external" },
-        "toolset": { "value": "host=x64", "strategy": "external" },
-        "cacheVariables": {
-            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
-        }
-    },

     {
         "name": "arm64-windows-llvm", "hidden": true,
         "architecture": { "value": "arm64", "strategy": "external" },

@@ -73,10 +64,6 @@
     { "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
     { "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
-
-    { "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
-    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
-    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },

     { "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
     { "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
     { "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
CODEOWNERS

@@ -2,7 +2,7 @@

 /ci/ @ggerganov
 /.devops/*.Dockerfile @ngxson
-/examples/server/ @ngxson
+/tools/server/ @ngxson
 /ggml/src/ggml-cuda/fattn* @JohannesGaessler
 /ggml/src/ggml-cuda/mmq.* @JohannesGaessler
 /ggml/src/ggml-cuda/mmv.* @JohannesGaessler
Makefile (101 changes)

@@ -367,7 +367,7 @@ ifdef LLAMA_SERVER_SSL
 endif

 ifndef GGML_NO_CPU_AARCH64
-MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
+MK_CPPFLAGS += -DGGML_USE_CPU_REPACK
 endif

 # warnings

@@ -970,7 +970,7 @@ OBJ_GGML = \
 $(DIR_GGML)/src/ggml-threading.o \
 $(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
 $(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
-$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
+$(DIR_GGML)/src/ggml-cpu/repack.o \
 $(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
 $(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
 $(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \

@@ -1156,10 +1156,10 @@ $(LIB_COMMON_S): $(OBJ_COMMON)

 # Clean generated server assets
 clean-server-assets:
-find examples/server -type f -name "*.js.hpp" -delete
+find tools/server -type f -name "*.js.hpp" -delete
-find examples/server -type f -name "*.mjs.hpp" -delete
+find tools/server -type f -name "*.mjs.hpp" -delete
-find examples/server -type f -name "*.css.hpp" -delete
+find tools/server -type f -name "*.css.hpp" -delete
-find examples/server -type f -name "*.html.hpp" -delete
+find tools/server -type f -name "*.html.hpp" -delete

 # Clean rule
 clean: clean-server-assets

@@ -1179,7 +1179,7 @@ clean: clean-server-assets
 # Helper function that replaces .c, .cpp, and .cu file endings with .o:
 GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))

-llama-cli: examples/main/main.cpp \
+llama-cli: tools/main/main.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1187,12 +1187,7 @@ llama-cli: examples/main/main.cpp \
 @echo '==== Run ./llama-cli -h for help. ===='
 @echo

-llama-infill: examples/infill/infill.cpp \
+llama-run: tools/run/run.cpp \
-$(OBJ_ALL)
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-run: examples/run/run.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1207,7 +1202,7 @@ llama-simple-chat: examples/simple-chat/simple-chat.cpp \
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-tokenize: examples/tokenize/tokenize.cpp \
+llama-tokenize: tools/tokenize/tokenize.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1217,27 +1212,27 @@ llama-batched: examples/batched/batched.cpp \
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-batched-bench: examples/batched-bench/batched-bench.cpp \
+llama-batched-bench: tools/batched-bench/batched-bench.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-quantize: examples/quantize/quantize.cpp \
+llama-quantize: tools/quantize/quantize.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
+llama-quantize-stats: tools/quantize-stats/quantize-stats.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-perplexity: examples/perplexity/perplexity.cpp \
+llama-perplexity: tools/perplexity/perplexity.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-imatrix: examples/imatrix/imatrix.cpp \
+llama-imatrix: tools/imatrix/imatrix.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1279,7 +1274,7 @@ llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/s
 $(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-gguf-split: examples/gguf-split/gguf-split.cpp \
+llama-gguf-split: tools/gguf-split/gguf-split.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1289,7 +1284,7 @@ llama-eval-callback: examples/eval-callback/eval-callback.cpp \
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
+llama-cvector-generator: tools/cvector-generator/cvector-generator.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1299,12 +1294,12 @@ llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-bench: examples/llama-bench/llama-bench.cpp \
+llama-bench: tools/llama-bench/llama-bench.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-export-lora: examples/export-lora/export-lora.cpp \
+llama-export-lora: tools/export-lora/export-lora.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1360,17 +1355,17 @@ llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

 ifdef GGML_RPC
-rpc-server: examples/rpc/rpc-server.cpp \
+rpc-server: tools/rpc/rpc-server.cpp \
 $(OBJ_GGML)
 $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 endif # GGML_RPC

 llama-server: \
-examples/server/server.cpp \
+tools/server/server.cpp \
-examples/server/utils.hpp \
+tools/server/utils.hpp \
-examples/server/httplib.h \
+tools/server/httplib.h \
-examples/server/index.html.hpp \
+tools/server/index.html.hpp \
-examples/server/loading.html.hpp \
+tools/server/loading.html.hpp \
 common/chat.cpp \
 common/chat.h \
 common/chat-template.hpp \

@@ -1378,10 +1373,10 @@ llama-server: \
 common/minja.hpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
+$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Itools/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

-# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+# Portable equivalent of `cd tools/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
-examples/server/%.hpp: examples/server/public/% FORCE Makefile
+tools/server/%.hpp: tools/server/public/% FORCE Makefile
 @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
 echo "unsigned char $${NAME}[] = {" && \
 cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \

@@ -1394,36 +1389,36 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-libllava.a: examples/llava/llava.cpp \
+libllava.a: tools/mtmd/llava.cpp \
-examples/llava/llava.h \
+tools/mtmd/llava.h \
-examples/llava/clip.cpp \
+tools/mtmd/clip.cpp \
-examples/llava/clip.h \
+tools/mtmd/clip.h \
 common/stb_image.h \
 common/base64.hpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual

-llama-llava-cli: examples/llava/llava-cli.cpp \
+llama-llava-cli: tools/mtmd/llava-cli.cpp \
-examples/llava/llava.cpp \
+tools/mtmd/llava.cpp \
-examples/llava/llava.h \
+tools/mtmd/llava.h \
-examples/llava/clip.cpp \
+tools/mtmd/clip.cpp \
-examples/llava/clip.h \
+tools/mtmd/clip.h \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

-llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
+llama-minicpmv-cli: tools/mtmd/minicpmv-cli.cpp \
-examples/llava/llava.cpp \
+tools/mtmd/llava.cpp \
-examples/llava/llava.h \
+tools/mtmd/llava.h \
-examples/llava/clip.cpp \
+tools/mtmd/clip.cpp \
-examples/llava/clip.h \
+tools/mtmd/clip.h \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

-llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
+llama-qwen2vl-cli: tools/mtmd/qwen2vl-cli.cpp \
-examples/llava/llava.cpp \
+tools/mtmd/llava.cpp \
-examples/llava/llava.h \
+tools/mtmd/llava.h \
-examples/llava/clip.cpp \
+tools/mtmd/clip.cpp \
-examples/llava/clip.h \
+tools/mtmd/clip.h \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

@@ -1480,12 +1475,12 @@ tests/test-double-float: tests/test-double-float.cpp

 tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
 $(OBJ_ALL)
-$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) -Itools/server -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

 tests/test-chat: tests/test-chat.cpp \
 $(OBJ_ALL)
-$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) -Itools/server -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

 tests/test-opt: tests/test-opt.cpp \
README.md (80 changes)

@@ -3,6 +3,7 @@
 

 [](https://opensource.org/licenses/MIT)
+[](https://github.com/ggml-org/llama.cpp/releases)
 [](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)

@@ -16,8 +17,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ## Hot topics

+- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
 - **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
-- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141]((https://github.com/ggml-org/llama.cpp/pull/13141))), `libllava` will be deprecated
+- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim

@@ -27,6 +29,30 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ----
+
+## Quick start
+
+Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
+
+- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
+- Run with Docker - see our [Docker documentation](docs/docker.md)
+- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
+- Build from source by cloning this repository - check out [our build guide](docs/build.md)
+
+Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more.
+
+Example command:
+
+```sh
+# Use a local model file
+llama-cli -m my_model.gguf
+
+# Or download and run a model directly from Hugging Face
+llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
+
+# Launch OpenAI-compatible API server
+llama-server -hf ggml-org/gemma-3-1b-it-GGUF
+```

 ## Description

 The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide

@@ -36,7 +62,7 @@ range of hardware - locally and in the cloud.
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2, AVX512 and AMX support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
-- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

@@ -129,6 +155,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 <details>
 <summary>Bindings</summary>

+- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama)
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)

@@ -228,6 +255,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo

 </details>

+
 ## Supported backends

 | Backend | Target devices |

@@ -236,23 +264,13 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [BLAS](docs/build.md#blas-build) | All |
 | [BLIS](docs/backend/BLIS.md) | All |
 | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
+| [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
 | [HIP](docs/build.md#hip) | AMD GPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
-| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) | All |
+| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |

-## Building the project
-
-The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
-The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
-
-- Clone this repository and build locally, see [how to build](docs/build.md)
-- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
-- Use a Docker image, see [documentation for Docker](docs/docker.md)
-- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
-
 ## Obtaining and quantizing models

@@ -261,7 +279,11 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
 - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
 - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)

-You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
+
+```sh
+llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
+```

 By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.

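Editor's note: combining the two mechanisms above, a download via an alternative endpoint might look like the following sketch (illustrative only; it assumes the same GGUF repository name is mirrored on that endpoint):

```sh
# hypothetical example: fetch the model through the ModelScope mirror instead of Hugging Face
MODEL_ENDPOINT=https://www.modelscope.cn/ llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
```
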
@@ -276,9 +298,9 @@ The Hugging Face platform provides a variety of online tools for converting, qua
 - Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268)
 - Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669)

-To learn more about model quantization, [read this documentation](examples/quantize/README.md)
+To learn more about model quantization, [read this documentation](tools/quantize/README.md)

-## [`llama-cli`](examples/main)
+## [`llama-cli`](tools/main)

 #### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.

@@ -341,7 +363,7 @@ To learn more about model quantization, [read this documentation](examples/quant
 </details>


-## [`llama-server`](examples/server)
+## [`llama-server`](tools/server)

 #### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs.

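Editor's note: as a quick sketch of that OpenAI-compatible interface (assuming a server already started with default settings, i.e. listening on port 8080):

```sh
# query the chat completions endpoint of a locally running llama-server
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"messages": [{"role": "user", "content": "Hello!"}]}'
```
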
@@ -411,7 +433,7 @@ To learn more about model quantization, [read this documentation](examples/quant
 </details>


-## [`llama-perplexity`](examples/perplexity)
+## [`llama-perplexity`](tools/perplexity)

 #### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.

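Editor's note: a typical invocation is sketched below (it assumes a plain-text evaluation file, such as the WikiText-2 test split, is available on disk):

```sh
# measure perplexity of a model over a text file
llama-perplexity -m model.gguf -f wiki.test.raw
```
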
@@ -436,10 +458,10 @@ To learn more about model quantization, [read this documentation](examples/quant

 </details>

-[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md)
+[^1]: [tools/perplexity/README.md](./tools/perplexity/README.md)
 [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)

-## [`llama-bench`](examples/llama-bench)
+## [`llama-bench`](tools/llama-bench)

 #### Benchmark the performance of the inference for various parameters.

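Editor's note: an illustrative run measuring prompt-processing and generation throughput for a single model might look like this (see the tool's own README for the full set of flags):

```sh
# benchmark 512-token prompt processing and 128-token generation
llama-bench -m model.gguf -p 512 -n 128
```
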
@@ -460,7 +482,7 @@ To learn more about model quantization, [read this documentation](examples/quant

 </details>

-## [`llama-run`](examples/run)
+## [`llama-run`](tools/run)

 #### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].

@@ -504,8 +526,8 @@ To learn more about model quantization, [read this documentation](examples/quant

 ## Other documentation

-- [main (cli)](examples/main/README.md)
+- [main (cli)](tools/main/README.md)
-- [server](examples/server/README.md)
+- [server](tools/server/README.md)
 - [GBNF grammars](grammars/README.md)

 #### Development documentation

@@ -571,4 +593,12 @@ automatically. For example:
 $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 ```

-## References
+## Dependencies
+
+- [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT license
+- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
+- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
+- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
+- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
+- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
+- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
SECURITY.md

@@ -40,7 +40,7 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
 ### Untrusted environments or networks

 If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
-* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
+* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
 * Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
 * Encrypt your data if sending it over the network.

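Editor's note: the hash check in the list above can be done with standard tooling, for example (a sketch — the file name and the published checksum are placeholders):

```sh
# compare the downloaded weights against the checksum published by the model provider
echo "<expected-sha256>  ./models/model.gguf" | sha256sum -c -
```
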
build-xcframework.sh

@@ -8,6 +8,7 @@ TVOS_MIN_OS_VERSION=16.4

 BUILD_SHARED_LIBS=OFF
 LLAMA_BUILD_EXAMPLES=OFF
+LLAMA_BUILD_TOOLS=OFF
 LLAMA_BUILD_TESTS=OFF
 LLAMA_BUILD_SERVER=OFF
 GGML_METAL=ON

@@ -31,6 +32,7 @@ COMMON_CMAKE_ARGS=(
     -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
     -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
     -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
+    -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
     -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
     -DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
     -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}

@@ -115,6 +117,7 @@ setup_framework_structure() {
     # Copy all required headers (common for all platforms)
     cp include/llama.h ${header_path}
     cp ggml/include/ggml.h ${header_path}
+    cp ggml/include/ggml-opt.h ${header_path}
     cp ggml/include/ggml-alloc.h ${header_path}
     cp ggml/include/ggml-backend.h ${header_path}
     cp ggml/include/ggml-metal.h ${header_path}
ci/README.md

@@ -54,7 +54,7 @@ docker run --privileged -it \
     -v $HOME/llama.cpp/ci-cache:/ci-cache \
     -v $HOME/llama.cpp/ci-results:/ci-results \
     -v $PWD:/ws -w /ws \
-    mthreads/musa:rc3.1.1-devel-ubuntu22.04
+    mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
 ```

 Inside the container, execute the following commands:

23 ci/run.sh
@@ -46,7 +46,20 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
 fi
 
 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"
+
+    if command -v nvidia-smi >/dev/null 2>&1; then
+        CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
+        if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then
+            CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}"
+        else
+            echo "Warning: Using fallback CUDA architectures"
+            CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89"
+        fi
+    else
+        echo "Error: nvidia-smi not found, cannot build with CUDA"
+        exit 1
+    fi
 fi
 
 if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -187,8 +200,8 @@ function gg_run_test_scripts_debug {
 
     set -e
 
-    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    (cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./tools/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
 
     set +e
 }
@@ -211,8 +224,8 @@ function gg_run_test_scripts_release {
 
     set -e
 
-    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    (cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./tools/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
 
     set +e
 }
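As a usage sketch (the output/mount directory layout is an assumption, not taken from this diff), the CUDA branch above only activates when `GG_BUILD_CUDA` is set before invoking the CI script:

```sh
# run the CI pipeline with the CUDA backend enabled;
# ci/run.sh queries the GPU's compute capability via nvidia-smi,
# or falls back to the fixed architecture list if the query fails
mkdir -p tmp/results tmp/mnt
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```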

@@ -1,6 +0,0 @@
-set( CMAKE_SYSTEM_NAME Windows )
-set( CMAKE_SYSTEM_PROCESSOR arm64 )
-
-set( target arm64-pc-windows-msvc )
-set( CMAKE_C_COMPILER_TARGET ${target} )
-set( CMAKE_CXX_COMPILER_TARGET ${target} )
@@ -3,9 +3,3 @@ set( CMAKE_SYSTEM_PROCESSOR x86_64 )
 
 set( CMAKE_C_COMPILER clang )
 set( CMAKE_CXX_COMPILER clang++ )
-
-set( arch_c_flags "-march=native" )
-
-set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
-set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )
-

@@ -58,21 +58,24 @@ add_library(${TARGET} STATIC
     arg.cpp
     arg.h
    base64.hpp
+    chat-parser.cpp
+    chat-parser.h
     chat.cpp
     chat.h
     common.cpp
     common.h
     console.cpp
     console.h
+    json-partial.cpp
+    json-partial.h
     json-schema-to-grammar.cpp
-    json.hpp
     llguidance.cpp
     log.cpp
     log.h
-    minja/chat-template.hpp
-    minja/minja.hpp
     ngram-cache.cpp
     ngram-cache.h
+    regex-partial.cpp
+    regex-partial.h
     sampling.cpp
     sampling.h
     speculative.cpp
@@ -119,8 +122,8 @@ if (LLAMA_LLGUIDANCE)
 
     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.10:
-        GIT_TAG 0309d2a6bf40abda35344a362edc71e06d5009f8
+        # v0.7.20 (+ fix to build on GCC 15):
+        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
         PREFIX ${CMAKE_BINARY_DIR}/llguidance
         SOURCE_DIR ${LLGUIDANCE_SRC}
         BUILD_IN_SOURCE TRUE
@@ -141,6 +144,30 @@ if (LLAMA_LLGUIDANCE)
     set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
 endif ()
 
-target_include_directories(${TARGET} PUBLIC .)
+target_include_directories(${TARGET} PUBLIC . ../vendor)
 target_compile_features (${TARGET} PUBLIC cxx_std_17)
 target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
+
+
+#
+# copy the license files
+#
+
+# Check if running in GitHub Actions
+if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
+    message(STATUS "Running inside GitHub Actions - copying license files")
+
+    # Copy all files from licenses/ to build/bin/
+    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
+    foreach(LICENSE_FILE ${LICENSE_FILES})
+        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
+        add_custom_command(
+            POST_BUILD
+            TARGET ${TARGET}
+            COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                "${LICENSE_FILE}"
+                "$<TARGET_FILE_DIR:llama>/${FILENAME}"
+            COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
+        message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
+    endforeach()
+endif()
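For context, a hedged configure sketch for the optional dependency that the `ExternalProject_Add` block above fetches; the `LLAMA_LLGUIDANCE` option appears in this CMakeLists, while the build directory name is arbitrary:

```sh
# enable the llguidance-based grammar backend at configure time;
# CMake then clones the pinned GIT_TAG shown above and links it into the common library
cmake -B build -DLLAMA_LLGUIDANCE=ON
cmake --build build --config Release -j$(nproc)
```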

355 common/arg.cpp
@@ -1,10 +1,11 @@
-#include "gguf.h" // for reading GGUF splits
 #include "arg.h"
 
+#include "chat.h"
 #include "common.h"
+#include "gguf.h" // for reading GGUF splits
+#include "json-schema-to-grammar.h"
 #include "log.h"
 #include "sampling.h"
-#include "chat.h"
 
 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -15,6 +16,9 @@
 #include <windows.h>
 #endif
 
+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>
+
 #include <algorithm>
 #include <climits>
 #include <cstdarg>
@@ -34,13 +38,11 @@
 #include <future>
 #endif
 
-#include "json-schema-to-grammar.h"
-
 using json = nlohmann::ordered_json;
 
 std::initializer_list<enum llama_example> mmproj_examples = {
-    LLAMA_EXAMPLE_LLAVA,
-    // TODO: add LLAMA_EXAMPLE_SERVER when it's ready
+    LLAMA_EXAMPLE_MTMD,
+    LLAMA_EXAMPLE_SERVER,
 };
 
 static std::string read_file(const std::string & fname) {

@@ -242,7 +244,56 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
 }
 
 // download one single file from remote URL to local path
-static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
+static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
+    // Check if the file already exists locally
+    auto file_exists = std::filesystem::exists(path);
+
+    // If the file exists, check its JSON metadata companion file.
+    std::string metadata_path = path + ".json";
+    nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
+    std::string etag;
+    std::string last_modified;
+
+    if (file_exists) {
+        if (offline) {
+            LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
+            return true; // skip verification/downloading
+        }
+        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
+        std::ifstream metadata_in(metadata_path);
+        if (metadata_in.good()) {
+            try {
+                metadata_in >> metadata;
+                LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+                    etag = metadata.at("etag");
+                }
+                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+                    last_modified = metadata.at("lastModified");
+                }
+            } catch (const nlohmann::json::exception & e) {
+                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+            }
+        }
+        // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
+    } else {
+        if (offline) {
+            LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
+            return false;
+        }
+        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+    }
+
+    // Send a HEAD request to retrieve the etag and last-modified headers
+    struct common_load_model_from_url_headers {
+        std::string etag;
+        std::string last_modified;
+    };
+
+    common_load_model_from_url_headers headers;
+    bool head_request_ok = false;
+    bool should_download = !file_exists; // by default, we should download if the file does not exist
+
     // Initialize libcurl
     curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
     curl_slist_ptr http_headers;

@@ -269,91 +320,47 @@ static bool common_download_file_single(const std::string & url, const std::stri
     curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
 #endif
 
-    // Check if the file already exists locally
-    auto file_exists = std::filesystem::exists(path);
-
-    // If the file exists, check its JSON metadata companion file.
-    std::string metadata_path = path + ".json";
-    nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
-    std::string etag;
-    std::string last_modified;
-
-    if (file_exists) {
-        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
-        std::ifstream metadata_in(metadata_path);
-        if (metadata_in.good()) {
-            try {
-                metadata_in >> metadata;
-                LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
-                    etag = metadata.at("etag");
-                }
-                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
-                    last_modified = metadata.at("lastModified");
-                }
-            } catch (const nlohmann::json::exception & e) {
-                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
-            }
-        }
-        // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
-    } else {
-        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
-    }
-
-    // Send a HEAD request to retrieve the etag and last-modified headers
-    struct common_load_model_from_url_headers {
-        std::string etag;
-        std::string last_modified;
-    };
-
-    common_load_model_from_url_headers headers;
-    bool head_request_ok = false;
-    bool should_download = !file_exists; // by default, we should download if the file does not exist
-
-    // get ETag to see if the remote file has changed
-    {
-        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
-        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
-
-            static std::regex header_regex("([^:]+): (.*)\r\n");
-            static std::regex etag_regex("ETag", std::regex_constants::icase);
-            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
-
-            std::string header(buffer, n_items);
-            std::smatch match;
-            if (std::regex_match(header, match, header_regex)) {
-                const std::string & key = match[1];
-                const std::string & value = match[2];
-                if (std::regex_match(key, match, etag_regex)) {
-                    headers->etag = value;
-                } else if (std::regex_match(key, match, last_modified_regex)) {
-                    headers->last_modified = value;
-                }
-            }
-            return n_items;
-        };
-
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
-
-        // we only allow retrying once for HEAD requests
-        // this is for the use case of using running offline (no internet), retrying can be annoying
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
-        if (!was_perform_successful) {
-            head_request_ok = false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code == 200) {
-            head_request_ok = true;
-        } else {
-            LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
-            head_request_ok = false;
-        }
+    typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
+    auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
+        common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
+
+        static std::regex header_regex("([^:]+): (.*)\r\n");
+        static std::regex etag_regex("ETag", std::regex_constants::icase);
+        static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+
+        std::string header(buffer, n_items);
+        std::smatch match;
+        if (std::regex_match(header, match, header_regex)) {
+            const std::string & key = match[1];
+            const std::string & value = match[2];
+            if (std::regex_match(key, match, etag_regex)) {
+                headers->etag = value;
+            } else if (std::regex_match(key, match, last_modified_regex)) {
+                headers->last_modified = value;
+            }
+        }
+        return n_items;
+    };
+
+    curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+    curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+
+    // we only allow retrying once for HEAD requests
+    // this is for the use case of using running offline (no internet), retrying can be annoying
+    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
+    if (!was_perform_successful) {
+        head_request_ok = false;
+    }
+
+    long http_code = 0;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+    if (http_code == 200) {
+        head_request_ok = true;
+    } else {
+        LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+        head_request_ok = false;
     }
 
     // if head_request_ok is false, we don't have the etag or last-modified headers

@@ -460,12 +467,12 @@ static bool common_download_file_single(const std::stri
 
 // download multiple files from remote URLs to local paths
 // the input is a vector of pairs <url, path>
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
     // Prepare download in parallel
     std::vector<std::future<bool>> futures_download;
     for (auto const & item : urls) {
-        futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
-            return common_download_file_single(it.first, it.second, bearer_token);
+        futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
+            return common_download_file_single(it.first, it.second, bearer_token, offline);
         }, item));
     }
 
@@ -481,14 +488,15 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin
 
 static bool common_download_model(
         const common_params_model & model,
-        const std::string & bearer_token) {
+        const std::string & bearer_token,
+        bool offline) {
     // Basic validation of the model.url
     if (model.url.empty()) {
         LOG_ERR("%s: invalid model url\n", __func__);
         return false;
     }
 
-    if (!common_download_file_single(model.url, model.path, bearer_token)) {
+    if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
         return false;
     }
 
@@ -547,7 +555,7 @@ static bool common_download_model(
         }
 
         // Download in parallel
-        common_download_file_multiple(urls, bearer_token);
+        common_download_file_multiple(urls, bearer_token, offline);
     }
 
     return true;
@@ -608,7 +616,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
  *
  * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
  */
-static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
+static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
     auto parts = string_split<std::string>(hf_repo_with_tag, ':');
     std::string tag = parts.size() > 1 ? parts.back() : "latest";
     std::string hf_repo = parts[0];

@@ -638,20 +646,25 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
     long res_code = 0;
     std::string res_str;
     bool use_cache = false;
-    try {
-        auto res = common_remote_get_content(url, params);
-        res_code = res.first;
-        res_str = std::string(res.second.data(), res.second.size());
-    } catch (const std::exception & e) {
-        LOG_WRN("error: failed to get manifest: %s\n", e.what());
-        LOG_WRN("try reading from cache\n");
-        // try to read from cache
+    if (!offline) {
         try {
+            auto res = common_remote_get_content(url, params);
+            res_code = res.first;
+            res_str = std::string(res.second.data(), res.second.size());
+        } catch (const std::exception & e) {
+            LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
+        }
+    }
+    if (res_code == 0) {
+        if (std::filesystem::exists(cached_response_path)) {
+            LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
             res_str = read_file(cached_response_path);
             res_code = 200;
             use_cache = true;
-        } catch (const std::exception & e) {
-            throw std::runtime_error("error: failed to get manifest (check your internet connection)");
+        } else {
+            throw std::runtime_error(
+                offline ? "error: failed to get manifest (offline mode)"
+                        : "error: failed to get manifest (check your internet connection)");
         }
     }
     std::string ggufFile;
@@ -698,24 +711,25 @@ bool common_has_curl() {
     return false;
 }
 
-static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
+static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from internet\n");
     return false;
 }
 
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
    LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return false;
 }
 
 static bool common_download_model(
         const common_params_model &,
-        const std::string &) {
+        const std::string &,
+        bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return false;
 }
 
-static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
+static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return {};
 }

@@ -742,7 +756,8 @@ struct handle_model_result {
 static handle_model_result common_params_handle_model(
         struct common_params_model & model,
         const std::string & bearer_token,
-        const std::string & model_path_default) {
+        const std::string & model_path_default,
+        bool offline) {
     handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
@@ -750,7 +765,7 @@ static handle_model_result common_params_handle_model(
         // short-hand to avoid specifying --hf-file -> default it to --model
         if (model.hf_file.empty()) {
             if (model.path.empty()) {
-                auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
+                auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
                 if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                     exit(1); // built without CURL, error message already printed
                 }
@@ -791,7 +806,7 @@ static handle_model_result common_params_handle_model(
 
     // then, download it if needed
     if (!model.url.empty()) {
-        bool ok = common_download_model(model, bearer_token);
+        bool ok = common_download_model(model, bearer_token, offline);
         if (!ok) {
             LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
             exit(1);
@@ -934,7 +949,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 
     // handle model and download
     {
-        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
         if (params.no_mmproj) {
             params.mmproj = {};
         } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -944,12 +959,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
             // only download mmproj if the current example is using it
             for (auto & ex : mmproj_examples) {
                 if (ctx_arg.ex == ex) {
-                    common_params_handle_model(params.mmproj, params.hf_token, "");
+                    common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
                     break;
                 }
             }
-        common_params_handle_model(params.speculative.model, params.hf_token, "");
-        common_params_handle_model(params.vocoder.model, params.hf_token, "");
+        common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
+        common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
     }
 
     if (params.escape) {

@@ -1283,7 +1298,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.use_color = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
     add_opt(common_arg(
         {"-t", "--threads"}, "N",
         string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
@@ -1333,9 +1348,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--prio"}, "N",
-        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
+        string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
         [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
+            if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
                 throw std::invalid_argument("invalid value");
             }
             params.cpuparams.priority = (enum ggml_sched_priority) prio;
@@ -1416,7 +1431,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"-n", "--predict", "--n-predict"}, "N",
         string_format(
-            ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
+            ex == LLAMA_EXAMPLE_MAIN
                 ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
                 : "number of tokens to predict (default: %d, -1 = infinity)",
             params.n_predict),
@@ -1445,6 +1460,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_keep = value;
         }
     ));
+    add_opt(common_arg(
+        {"--swa-full"},
+        string_format("use full-size SWA cache (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
+        [](common_params & params) {
+            params.swa_full = true;
+        }
+    ).set_env("LLAMA_ARG_SWA_FULL"));
     add_opt(common_arg(
         {"--no-context-shift"},
         string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -1655,7 +1678,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_prefix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--in-suffix"}, "STRING",
         "string to suffix after user inputs with (default: empty)",
@@ -1663,14 +1686,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_suffix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--no-warmup"},
         "skip warming up the model with an empty run",
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
@@ -1680,7 +1703,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.spm_infill = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--samplers"}, "SAMPLERS",
         string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),

@@ -2057,13 +2080,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.grp_attn_w = value;
         }
     ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(common_arg(
-        {"-dkvc", "--dump-kv-cache"},
-        "verbose print of the KV cache",
-        [](common_params & params) {
-            params.dump_kv_cache = true;
-        }
-    ));
     add_opt(common_arg(
         {"-nkvo", "--no-kv-offload"},
         "disable KV offload",
@@ -2097,13 +2113,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
-    add_opt(common_arg(
-        {"--perplexity", "--all-logits"},
-        string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
-        [](common_params & params) {
-            params.logits_all = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--hellaswag"},
         "compute HellaSwag score over random tasks from datafile supplied with -f",
@@ -2211,39 +2220,40 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
     add_opt(common_arg(
         {"--mmproj"}, "FILE",
-        "path to a multimodal projector file. see examples/llava/README.md",
+        "path to a multimodal projector file. see tools/mtmd/README.md\n"
+        "note: if -hf is used, this argument can be omitted",
         [](common_params & params, const std::string & value) {
             params.mmproj.path = value;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
     add_opt(common_arg(
         {"--mmproj-url"}, "URL",
-        "URL to a multimodal projector file. see examples/llava/README.md",
+        "URL to a multimodal projector file. see tools/mtmd/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.url = value;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
     add_opt(common_arg(
         {"--no-mmproj"},
         "explicitly disable multimodal projector, useful when using -hf",
         [](common_params & params) {
             params.no_mmproj = true;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
     add_opt(common_arg(
         {"--no-mmproj-offload"},
         "do not offload multimodal projector to GPU",
         [](common_params & params) {
             params.mmproj_use_gpu = false;
         }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
-        {"--image"}, "FILE",
-        "path to an image file. use with multimodal models. Specify multiple times for batching",
+        {"--image", "--audio"}, "FILE",
+        "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
         [](common_params & params, const std::string & value) {
             params.image.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples({LLAMA_EXAMPLE_MTMD}));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",

@@ -2443,6 +2453,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--no-op-offload"},
+        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
+        [](common_params & params) {
+            params.no_op_offload = true;
+        }
+    ));
     add_opt(common_arg(
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
@@ -2584,7 +2601,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_junk = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+    ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"--pos"}, "N",
         string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
@@ -2634,13 +2651,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.i_chunk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--parse-special"},
+        string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
+        [](common_params & params) {
+            params.parse_special = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"-pps"},
         string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
         [](common_params & params) {
             params.is_pp_shared = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"-npp"}, "n0,n1,...",
         "number of prompt tokens",
@@ -2839,15 +2863,25 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
-        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
-        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
-        "only supported for non-streamed responses",
+        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
+        "- none: leaves thoughts unparsed in `message.content`\n"
+        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+        "(default: deepseek)",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
             else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
+    add_opt(common_arg(
+        {"--reasoning-budget"}, "N",
+        "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
+        [](common_params & params, int value) {
+            if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
+            params.reasoning_budget = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(

@@ -2859,7 +2893,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(
@@ -2872,6 +2906,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.chat_template = read_file(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"--no-prefill-assistant"},
+        string_format(
+            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+        ),
+        [](common_params & params) {
+            params.prefill_assistant = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -2892,7 +2936,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.simple_io = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--positive-file"}, "FNAME",
         string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
@@ -2936,7 +2980,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
             else if (value == "md") { params.batched_bench_output_jsonl = false; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
     add_opt(common_arg(
@@ -2968,6 +3012,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             common_log_set_verbosity_thold(INT_MAX);
         }
     ));
+    add_opt(common_arg(
+        {"--offline"},
+        "Offline mode: forces use of cache, prevents network access",
+        [](common_params & params) {
+            params.offline = true;
+        }
+    ).set_env("LLAMA_OFFLINE"));
     add_opt(common_arg(
         {"-lv", "--verbosity", "--log-verbosity"}, "N",
         "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
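A short, hedged sketch of how the flags introduced above combine on the command line; the model names are placeholders and `llama-mtmd-cli` is an assumed multimodal tool name, so check `--help` of your build for the authoritative list:

```sh
# reuse a previously downloaded model without touching the network,
# keep DeepSeek-style thoughts in message.reasoning_content, and disable thinking entirely
llama-server -hf some-org/Some-Model-GGUF \
    --offline \
    --reasoning-format deepseek \
    --reasoning-budget 0

# multimodal run: --image/--audio can be repeated, one file per flag
llama-mtmd-cli -m model.gguf --mmproj mmproj.gguf --image a.png --audio b.wav
```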

380 common/chat-parser.cpp Normal file
@@ -0,0 +1,380 @@
+#include "chat-parser.h"
+#include "common.h"
+#include "log.h"
+#include "regex-partial.h"
+
+#include <optional>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+using json = nlohmann::ordered_json;
+
+common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax)
+    : input_(input), is_partial_(is_partial), syntax_(syntax)
+{
+    result_.role = "assistant";
+
+    while (true) {
+        std::string id = std::to_string(std::rand());
+        if (input.find(id) == std::string::npos) {
+            healing_marker_ = id;
+            break;
+        }
+    }
+}
+
+std::string common_chat_msg_parser::str(const common_string_range & rng) const {
+    GGML_ASSERT(rng.begin <= rng.end);
+    return input_.substr(rng.begin, rng.end - rng.begin);
+}
+
+void common_chat_msg_parser::add_content(const std::string &content) {
+    result_.content += content;
+}
+
+void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_content) {
+    result_.reasoning_content += reasoning_content;
+}
+
+bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
+    if (name.empty()) {
+        return false;
+    }
+
+    common_chat_tool_call tool_call;
+    tool_call.name = name;
+    tool_call.arguments = arguments;
+    tool_call.id = id;
+
+    // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
+    result_.tool_calls.emplace_back(tool_call);
+    return true;
+}
+bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
+    std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
+    std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
+    std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
+    return add_tool_call(name, id, arguments);
+}
+
+bool common_chat_msg_parser::add_tool_calls(const json & arr) {
+    for (const auto & item : arr) {
+        if (!add_tool_call(item)) {
+            return false;
+        }
+    }
+    return true;
+}
+void common_chat_msg_parser::finish() {
+    if (!is_partial_ && pos_ != input_.size()) {
+        throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
+    }
+}
+
+bool common_chat_msg_parser::consume_spaces() {
+    const auto length = input_.size();
+    auto consumed = false;
+    while (pos_ < length && std::isspace(input_[pos_])) {
+        ++pos_;
+        consumed = true;
+    }
+    return consumed;
+}
+
+bool common_chat_msg_parser::try_consume_literal(const std::string & literal) {
+    auto pos = pos_;
+    for (auto i = 0u; i < literal.size(); ++i) {
+        if (pos >= input_.size()) {
+            return false;
+        }
+        if (input_[pos] != literal[i]) {
+            return false;
+        }
+        ++pos;
+    }
+    pos_ = pos;
+    return true;
+}
+
+std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_literal(const std::string & literal) {
+    auto idx = input_.find(literal, pos_);
+    if (idx != std::string::npos) {
+        find_regex_result res;
+        res.prelude = input_.substr(pos_, idx - pos_);
+        auto end = idx + literal.size();
+        res.groups.emplace_back(common_string_range{idx, end});
+        move_to(end);
+        return res;
+    }
+    if (is_partial_) {
+        idx = string_find_partial_stop(input_, literal);
+        if (idx != std::string::npos && idx >= pos_) {
+            find_regex_result res;
+            res.prelude = input_.substr(pos_, idx - pos_);
+            auto end = input_.size();
+            res.groups.emplace_back(common_string_range{idx, end});
+            move_to(end);
+            return res;
+        }
+    }
+    return std::nullopt;
+}
+
+void common_chat_msg_parser::consume_literal(const std::string & literal) {
+    if (!try_consume_literal(literal)) {
+        throw common_chat_msg_partial_exception(literal);
+    }
+}
+
+bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
+    auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
+        auto stripped_reasoning = string_strip(reasoning);
+        if (stripped_reasoning.empty()) {
+            return;
+        }
+        if (syntax_.reasoning_in_content) {
+            add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "<think>" : start_think);
+            add_content(stripped_reasoning);
+            if (closed) {
+                add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
+            }
+        } else {
+            add_reasoning_content(stripped_reasoning);
+        }
+    };
+    if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
+        if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
+            if (auto res = try_find_literal(end_think)) {
+                handle_reasoning(res->prelude, /* closed */ true);
+                consume_spaces();
+                return true;
+            }
+            auto rest = consume_rest();
+            if (!rest.empty()) {
+                handle_reasoning(rest, /* closed */ !is_partial());
+            }
+            // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
+            // if (!syntax_.thinking_forced_open) {
+            //     throw common_chat_msg_partial_exception(end_think);
+            // }
+            return true;
+        }
+    }
+    return false;
+}
+
+std::string common_chat_msg_parser::consume_rest() {
+    auto rest = input_.substr(pos_);
+    pos_ = input_.size();
+    return rest;
+}
+
+// Tries to find the regex, consumes it (pos right after it) and gives the prelude (right before it) and the groups to the callback.
+std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from, bool add_prelude_to_content) {
+    auto m = regex.search(input_, from == std::string::npos ? pos_ : from);
+    if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
+        return std::nullopt;
+    }
+    auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
+    pos_ = m.groups[0].end;
+
+    if (add_prelude_to_content) {
+        add_content(prelude);
+    }
+    if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
+        if (is_partial()) {
+            throw common_chat_msg_partial_exception(regex.str());
+        }
+        return std::nullopt;
+    }
+    return find_regex_result{prelude, m.groups};
+}
+
+common_chat_msg_parser::find_regex_result common_chat_msg_parser::consume_regex(const common_regex & regex) {
+    if (auto result = try_consume_regex(regex)) {
+        return *result;
+    }
+    throw common_chat_msg_partial_exception(regex.str());
+}
+
+std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_consume_regex(const common_regex & regex) {
+    auto m = regex.search(input_, pos_);
+    if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
+        return std::nullopt;
+    }
+    if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
+        if (is_partial()) {
+            throw common_chat_msg_partial_exception(regex.str());
+        }
+        return std::nullopt;
+    }
+    if (m.groups[0].begin != pos_) {
+        // Didn't match at the current position.
+        return std::nullopt;
+    }
+    pos_ = m.groups[0].end;
+
+    return find_regex_result {
+        /* .prelude = */ "",
+        m.groups,
+    };
+}
+
+std::optional<common_json> common_chat_msg_parser::try_consume_json() {
+    auto it = input_.cbegin() + pos_;
+    const auto end = input_.cend();
+    common_json result;
+    if (!common_json_parse(it, end, healing_marker_, result)) {
+        return std::nullopt;
+    }
+    pos_ = std::distance(input_.cbegin(), it);
+    if (result.healing_marker.marker.empty()) {
+        // No healing marker, just return the parsed json
+        return result;
|
||||||
|
}
|
||||||
|
if (!is_partial()) {
|
||||||
|
throw common_chat_msg_partial_exception("JSON");
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
common_json common_chat_msg_parser::consume_json() {
|
||||||
|
if (auto result = try_consume_json()) {
|
||||||
|
return *result;
|
||||||
|
}
|
||||||
|
throw common_chat_msg_partial_exception("JSON");
|
||||||
|
}
|
||||||
|
|
||||||
|
common_chat_msg_parser::consume_json_result common_chat_msg_parser::consume_json_with_dumped_args(
|
||||||
|
const std::vector<std::vector<std::string>> & args_paths,
|
||||||
|
const std::vector<std::vector<std::string>> & content_paths
|
||||||
|
) {
|
||||||
|
if (auto result = try_consume_json_with_dumped_args(args_paths, content_paths)) {
|
||||||
|
return *result;
|
||||||
|
}
|
||||||
|
throw common_chat_msg_partial_exception("JSON");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parser::try_consume_json_with_dumped_args(
|
||||||
|
const std::vector<std::vector<std::string>> & args_paths,
|
||||||
|
const std::vector<std::vector<std::string>> & content_paths
|
||||||
|
) {
|
||||||
|
auto partial = try_consume_json();
|
||||||
|
if (!partial) {
|
||||||
|
return std::nullopt;
|
||||||
|
}
|
||||||
|
auto is_arguments_path = [&](const std::vector<std::string> & path) {
|
||||||
|
return std::find(args_paths.begin(), args_paths.end(), path) != args_paths.end();
|
||||||
|
};
|
||||||
|
auto is_content_path = [&](const std::vector<std::string> & path) {
|
||||||
|
return std::find(content_paths.begin(), content_paths.end(), path) != content_paths.end();
|
||||||
|
};
|
||||||
|
|
||||||
|
if (partial->healing_marker.marker.empty()) {
|
||||||
|
if (args_paths.empty()) {
|
||||||
|
// No arguments to dump, and JSON was parsed fully.
|
||||||
|
return consume_json_result {
|
||||||
|
partial->json,
|
||||||
|
/* .is_partial = */ false,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
if (is_arguments_path({})) {
|
||||||
|
// Entire JSON is the arguments and was parsed fully.
|
||||||
|
return consume_json_result {
|
||||||
|
partial->json.dump(),
|
||||||
|
/* .is_partial = */ false,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_DBG("Parsed partial JSON: %s (json_healing_marker: %s)\n", partial->json.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
|
||||||
|
|
||||||
|
auto found_healing_marker = false;
|
||||||
|
std::vector<std::string> path;
|
||||||
|
std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
|
||||||
|
if (is_arguments_path(path)) {
|
||||||
|
auto arguments = j.dump();
|
||||||
|
if (is_partial() && !partial->healing_marker.marker.empty()) {
|
||||||
|
auto idx = arguments.find(partial->healing_marker.json_dump_marker);
|
||||||
|
if (idx != std::string::npos) {
|
||||||
|
arguments.resize(idx);
|
||||||
|
found_healing_marker = true;
|
||||||
|
}
|
||||||
|
if (arguments == "\"") {
|
||||||
|
// This happens because of completing `:"$magic` after `"arguments"`
|
||||||
|
arguments = "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return arguments;
|
||||||
|
}
|
||||||
|
if (is_content_path(path)) {
|
||||||
|
if (!j.is_string()) {
|
||||||
|
throw std::runtime_error("Content path must be a string");
|
||||||
|
}
|
||||||
|
std::string str = j;
|
||||||
|
auto idx = str.find(partial->healing_marker.marker); // not using json_dump_marker as we're inside a string
|
||||||
|
if (idx != std::string::npos) {
|
||||||
|
str.resize(idx);
|
||||||
|
found_healing_marker = true;
|
||||||
|
}
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
if (j.is_object()) {
|
||||||
|
auto obj = json::object();
|
||||||
|
for (const auto & p : j.items()) {
|
||||||
|
const auto & key = p.key();
|
||||||
|
const auto & value = p.value();
|
||||||
|
const std::string key_str = key; // NOLINT
|
||||||
|
auto idx = key_str.find(healing_marker_);
|
||||||
|
if (idx != std::string::npos) {
|
||||||
|
found_healing_marker = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
path.push_back(key_str);
|
||||||
|
if (value.is_string()) {
|
||||||
|
const std::string value_str = value;
|
||||||
|
if (value_str.find(healing_marker_) != std::string::npos) {
|
||||||
|
found_healing_marker = true;
|
||||||
|
if (is_content_path(path)) {
|
||||||
|
if (partial->healing_marker.marker == partial->healing_marker.json_dump_marker) {
|
||||||
|
// The healing occurred inside the string: good. Otherwise we just ditch the entire key/value pair.
|
||||||
|
obj[key] = remove_unsupported_healings_and_dump_args(value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
obj[key] = value;
|
||||||
|
} else {
|
||||||
|
obj[key] = remove_unsupported_healings_and_dump_args(value);
|
||||||
|
}
|
||||||
|
path.pop_back();
|
||||||
|
}
|
||||||
|
return obj;
|
||||||
|
}
|
||||||
|
if (j.is_array()) {
|
||||||
|
auto arr = json::array();
|
||||||
|
for (const auto & value : j) {
|
||||||
|
if (value.is_string()) {
|
||||||
|
std::string str = value;
|
||||||
|
auto idx = str.find(healing_marker_);
|
||||||
|
if (idx != std::string::npos) {
|
||||||
|
// Don't heal array values that aren't in the arguments.
|
||||||
|
found_healing_marker = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
arr.push_back(remove_unsupported_healings_and_dump_args(value));
|
||||||
|
}
|
||||||
|
return arr;
|
||||||
|
}
|
||||||
|
return j;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto cleaned = remove_unsupported_healings_and_dump_args(partial->json);
|
||||||
|
LOG_DBG("Cleaned up JSON %s to %s (json_healing_marker : '%s')\n", partial->json.dump().c_str(), cleaned.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
|
||||||
|
return consume_json_result {
|
||||||
|
cleaned,
|
||||||
|
/* .is_partial = */ found_healing_marker,
|
||||||
|
};
|
||||||
|
}
|
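The helpers above are presumably what the individual chat formats build on (the common/chat.cpp diff itself is suppressed further down). As a rough, illustrative sketch only (the input string and syntax settings below are invented for the example and are not part of this commit):

    // extracting DeepSeek-style reasoning with the parser defined above
    common_chat_syntax syntax;
    syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    common_chat_msg_parser builder("<think>I should call a tool.</think>The answer is 4.",
                                   /* is_partial= */ false, syntax);
    builder.try_parse_reasoning("<think>", "</think>");
    builder.add_content(builder.consume_rest());
    // builder.result().reasoning_content == "I should call a tool."
    // builder.result().content           == "The answer is 4."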
common/chat-parser.h (new file, 118 lines)
@ -0,0 +1,118 @@
#pragma once

#include "chat.h"
#include "json-partial.h"
#include "regex-partial.h"

#include <nlohmann/json.hpp>

#include <optional>
#include <string>
#include <vector>

class common_chat_msg_partial_exception : public std::runtime_error {
  public:
    common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
};

class common_chat_msg_parser {
    std::string input_;
    bool is_partial_;
    common_chat_syntax syntax_;
    std::string healing_marker_;

    size_t pos_ = 0;
    common_chat_msg result_;

  public:
    common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
    const std::string & input() const { return input_; }
    size_t pos() const { return pos_; }
    const std::string & healing_marker() const { return healing_marker_; }
    const bool & is_partial() const { return is_partial_; }
    const common_chat_msg & result() const { return result_; }
    const common_chat_syntax & syntax() const { return syntax_; }

    void move_to(size_t pos) {
        if (pos > input_.size()) {
            throw std::runtime_error("Invalid position!");
        }
        pos_ = pos;
    }
    void move_back(size_t n) {
        if (pos_ < n) {
            throw std::runtime_error("Can't move back that far!");
        }
        pos_ -= n;
    }

    // Get the substring of the input at the given range
    std::string str(const common_string_range & rng) const;

    // Appends to the result.content field
    void add_content(const std::string & content);

    // Appends to the result.reasoning_content field
    void add_reasoning_content(const std::string & reasoning_content);

    // Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
    bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);

    // Adds a tool call using the "name", "id" and "arguments" fields of the json object
    bool add_tool_call(const nlohmann::ordered_json & tool_call);

    // Adds an array of tool calls using their "name", "id" and "arguments" fields.
    bool add_tool_calls(const nlohmann::ordered_json & arr);

    void finish();

    bool consume_spaces();

    void consume_literal(const std::string & literal);

    bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);

    std::string consume_rest();

    struct find_regex_result {
        std::string prelude;
        std::vector<common_string_range> groups;
    };

    std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);

    bool try_consume_literal(const std::string & literal);

    std::optional<find_regex_result> try_find_literal(const std::string & literal);

    find_regex_result consume_regex(const common_regex & regex);

    std::optional<find_regex_result> try_consume_regex(const common_regex & regex);

    std::optional<common_json> try_consume_json();
    common_json consume_json();

    struct consume_json_result {
        nlohmann::ordered_json value;
        bool is_partial;
    };

    /*
        Consume (possibly partial) json and converts specific subtrees to (possibly truncated) JSON strings.

        By default, object keys can't be truncated, nor can string values (their corresponding key is removed,
        e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`

        But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated json strings
        - with `content_paths={{"foo"}}` -> `{"foo": "b` -> {"foo": "b"}`
        - with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}`
    */
    consume_json_result consume_json_with_dumped_args(
        const std::vector<std::vector<std::string>> & args_paths = {},
        const std::vector<std::vector<std::string>> & content_paths = {}
    );
    std::optional<consume_json_result> try_consume_json_with_dumped_args(
        const std::vector<std::vector<std::string>> & args_paths = {},
        const std::vector<std::vector<std::string>> & content_paths = {}
    );
};
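To make the truncation rules documented above concrete, a minimal sketch (the input string and the "arguments" path are invented for illustration; only the API declared above is assumed):

    // streaming a partial tool call whose arguments object is still open
    common_chat_syntax syntax; // defaults
    common_chat_msg_parser builder("{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Par",
                                   /* is_partial= */ true, syntax);
    auto data = builder.consume_json_with_dumped_args({{"arguments"}}, {});
    // data.value["arguments"] holds a truncated JSON dump such as "{\"city\":\"Par"
    // and data.is_partial is true, so a caller can emit an incremental delta.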
common/chat.cpp (1599 lines changed)
File diff suppressed because it is too large
@ -3,6 +3,8 @@
 #pragma once

 #include "common.h"
+#include <functional>
+#include <chrono>
 #include <string>
 #include <vector>

@ -12,11 +14,19 @@ struct common_chat_tool_call {
     std::string name;
     std::string arguments;
     std::string id;
+
+    bool operator==(const common_chat_tool_call & other) const {
+        return name == other.name && arguments == other.arguments && id == other.id;
+    }
 };

 struct common_chat_msg_content_part {
     std::string type;
     std::string text;
+
+    bool operator==(const common_chat_msg_content_part & other) const {
+        return type == other.type && text == other.text;
+    }
 };

 struct common_chat_msg {
@ -27,6 +37,51 @@ struct common_chat_msg {
     std::string reasoning_content;
     std::string tool_name;
     std::string tool_call_id;
+
+    template <class T> T to_json_oaicompat() const;
+
+    bool empty() const {
+        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
+    }
+    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
+        for (auto i = 0u; i < tool_calls.size(); i++) {
+            if (ids_cache.size() <= i) {
+                auto id = tool_calls[i].id;
+                if (id.empty()) {
+                    id = gen_tool_call_id();
+                }
+                ids_cache.push_back(id);
+            }
+            tool_calls[i].id = ids_cache[i];
+        }
+    }
+    bool operator==(const common_chat_msg & other) const {
+        return role == other.role
+            && content == other.content
+            && content_parts == other.content_parts
+            && tool_calls == other.tool_calls
+            && reasoning_content == other.reasoning_content
+            && tool_name == other.tool_name
+            && tool_call_id == other.tool_call_id;
+    }
+    bool operator!=(const common_chat_msg & other) const {
+        return !(*this == other);
+    }
+};
+
+struct common_chat_msg_diff {
+    std::string reasoning_content_delta;
+    std::string content_delta;
+    size_t tool_call_index = std::string::npos;
+    common_chat_tool_call tool_call_delta;
+
+    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
+
+    bool operator==(const common_chat_msg_diff & other) const {
+        return content_delta == other.content_delta
+            && tool_call_index == other.tool_call_index
+            && tool_call_delta == other.tool_call_delta;
+    }
 };

 struct common_chat_tool {
@ -48,14 +103,11 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_LLAMA_3_X,
     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
     COMMON_CHAT_FORMAT_DEEPSEEK_R1,
-    COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
-    COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
-    COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,

     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@ -70,7 +122,9 @@ struct common_chat_templates_inputs {
     std::vector<common_chat_tool> tools;
     common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
     bool parallel_tool_calls = false;
-    bool extract_reasoning = true;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    bool enable_thinking = true;
+    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 };

 struct common_chat_params {
@ -78,11 +132,21 @@ struct common_chat_params {
     std::string prompt;
     std::string grammar;
     bool grammar_lazy = false;
+    bool thinking_forced_open = false;
     std::vector<common_grammar_trigger> grammar_triggers;
     std::vector<std::string> preserved_tokens;
     std::vector<std::string> additional_stops;
 };

+struct common_chat_syntax {
+    common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
+    bool reasoning_in_content = false;
+    bool thinking_forced_open = false;
+    bool parse_tool_calls = true;
+};
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);

@ -119,8 +183,9 @@ std::string common_chat_format_example(
     const struct common_chat_templates * tmpls,
     bool use_jinja);

-std::string common_chat_format_name(common_chat_format format);
-common_chat_msg common_chat_parse( const std::string & input, common_chat_format format);
+const char* common_chat_format_name(common_chat_format format);
+const char* common_reasoning_format_name(common_reasoning_format format);
+common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

@ -133,3 +198,5 @@ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
 template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
+
+template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
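A small usage sketch for the new ensure_tool_call_ids_set helper shown in the hunk above (values are made up; a real caller would pass its own id generator):

    common_chat_msg msg;
    msg.role = "assistant";
    msg.tool_calls.push_back({ /* name */ "get_weather", /* arguments */ "{}", /* id */ "" });
    std::vector<std::string> ids_cache;
    msg.ensure_tool_call_ids_set(ids_cache, [] { return std::string("call_0"); });
    // msg.tool_calls[0].id == "call_0"; passing the same ids_cache on a later parse
    // of the same streamed message keeps the generated id stable.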
@ -203,6 +203,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {

     DWORD p = NORMAL_PRIORITY_CLASS;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW: p = BELOW_NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
@ -228,6 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {

     int p = 0;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW: p = 5; break;
         case GGML_SCHED_PRIO_NORMAL: p = 0; break;
         case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
         case GGML_SCHED_PRIO_HIGH: p = -10; break;
@ -443,6 +445,25 @@ void string_replace_all(std::string & s, const std::string & search, const std::
     s = std::move(builder);
 }

+bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
+    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+}
+size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
+    if (!str.empty() && !stop.empty()) {
+        const char text_last_char = str.back();
+        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
+            if (stop[char_index] == text_last_char) {
+                const auto current_partial = stop.substr(0, char_index + 1);
+                if (string_ends_with(str, current_partial)) {
+                    return str.size() - char_index - 1;
+                }
+            }
+        }
+    }
+
+    return std::string::npos;
+}
+
 std::string regex_escape(const std::string & s) {
     static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
     return std::regex_replace(s, special_chars, "\\$0");
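The contract of the two helpers added above, worked out by hand on made-up inputs (illustrative only):

    // "<too" is a partial occurrence of the stop string starting at index 6
    assert(string_find_partial_stop("Hello <too", "<tool_call>") == 6);
    // no suffix of the text is a prefix of the stop string
    assert(string_find_partial_stop("Hello", "<tool_call>") == std::string::npos);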
@ -830,7 +851,7 @@ std::string fs_get_cache_directory() {
     if (getenv("LLAMA_CACHE")) {
         cache_directory = std::getenv("LLAMA_CACHE");
     } else {
-#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
         if (std::getenv("XDG_CACHE_HOME")) {
             cache_directory = std::getenv("XDG_CACHE_HOME");
         } else {
@ -884,13 +905,16 @@ struct common_init_result common_init_from_params(common_params & params) {
             ok = false;
         }

-        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+
+        if (!has_eos && !has_sep) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+            ok = false;
+        } else if (!has_eos) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+        } else if (!has_sep) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
             ok = false;
         }

@ -910,7 +934,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }

-    if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
         LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }
@ -1017,7 +1041,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_decoder(model)) {
            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
-        llama_kv_self_clear(lctx);
+        llama_memory_clear(llama_get_memory(lctx), true);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
         llama_set_warmup(lctx, false);
@ -1083,6 +1107,9 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
     }

+    mparams.progress_callback = params.load_progress_callback;
+    mparams.progress_callback_user_data = params.load_progress_callback_user_data;
+
     return mparams;
 }

@ -1096,7 +1123,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.n_threads = params.cpuparams.n_threads;
     cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
                               params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
-    cparams.logits_all = params.logits_all;
     cparams.embeddings = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base = params.rope_freq_base;
@ -1114,6 +1140,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
+    cparams.op_offload = !params.no_op_offload;
+    cparams.swa_full = params.swa_full;

     if (params.reranking) {
         cparams.embeddings = true;
@ -1306,81 +1334,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
     return text;
 }

-//
-// KV cache utils
-//
-
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        int seq_count = 0;
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) { seq_count++; }
-        }
-        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    std::unordered_map<llama_seq_id, size_t> seqs;
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] < 0) { continue; }
-            if (seqs.find(cs_curr[j]) == seqs.end()) {
-                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                const size_t sz = seqs.size();
-                seqs[cs_curr[j]] = sz;
-            }
-        }
-        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-    }
-
-    printf("=== Sequence legend: ");
-    for (const auto & it : seqs) {
-        printf("%zu=%d, ", it.second, it.first);
-    }
-    printf("'+'=other sequence ids");
-
-    c_curr = view.cells;
-    cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) {
-                const auto & it = seqs.find(cs_curr[j]);
-                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
-            } else {
-                putchar('.');
-            }
-        }
-        putchar(' ');
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
 //
 // Embedding utils
 //
@ -1565,3 +1518,20 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c

     return result;
 }
+
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
+    const int64_t ne_datapoint = llama_n_ctx(ctx);
+    const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride;
+    ggml_opt_dataset_t result = ggml_opt_dataset_init(
+        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
+
+    llama_token * data = (llama_token *) ggml_opt_dataset_data(result)->data;
+    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
+
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
+        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
+    }
+
+    return result;
+}
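A worked example of the indexing in common_opt_dataset_init above (numbers chosen for illustration): with llama_n_ctx(ctx) == 4, stride == 4 and a 13-token stream, ndata = (13 - 4 - 1) / 4 = 2, so data[0] = tokens[0..3] with labels[0] = tokens[1..4], and data[1] = tokens[4..7] with labels[1] = tokens[5..8]; the labels are always the inputs shifted by one token.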
@ -6,6 +6,7 @@

 #include <set>
 #include <string>
+#include <string_view>
 #include <vector>
 #include <sstream>

@ -66,7 +67,6 @@ enum llama_example {
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
     LLAMA_EXAMPLE_MAIN,
-    LLAMA_EXAMPLE_INFILL,
     LLAMA_EXAMPLE_EMBEDDING,
     LLAMA_EXAMPLE_PERPLEXITY,
     LLAMA_EXAMPLE_RETRIEVAL,
@ -76,7 +76,7 @@ enum llama_example {
     LLAMA_EXAMPLE_SERVER,
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
@ -96,6 +96,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_XTC = 8,
     COMMON_SAMPLER_TYPE_INFILL = 9,
     COMMON_SAMPLER_TYPE_PENALTIES = 10,
+    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
 };

 // dimensionality reduction methods, used by cvector-generator
@ -114,7 +115,7 @@ enum common_grammar_trigger_type {
     COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
     COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
     COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
-    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
+    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
 };

 struct common_grammar_trigger {
@ -161,6 +162,7 @@ struct common_params_sampling {
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
         COMMON_SAMPLER_TYPE_TOP_P,
@ -213,7 +215,8 @@ struct common_params_vocoder {

 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
+    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };

 struct common_params {
@ -289,6 +292,7 @@ struct common_params {
     int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector
+    bool offline = false;

     int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@ -321,17 +325,17 @@ struct common_params {
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = true; // context shift on inifinite text generation
+    bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
+    bool no_op_offload = false; // globally disable offload host tensor operations to device

     bool single_turn = false; // single turn chat conversation

@ -340,7 +344,7 @@ struct common_params {

     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

-    // multimodal models (see examples/llava)
+    // multimodal models (see tools/mtmd)
     struct common_params_model mmproj;
     bool mmproj_use_gpu = true; // use GPU for multimodal model
     bool no_mmproj = false; // explicitly disable multimodal model
@ -366,6 +370,8 @@ struct common_params {
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    int reasoning_budget = -1;
+    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

     std::vector<std::string> api_keys;

|
|||||||
|
|
||||||
bool process_output = false; // collect data for the output tensor
|
bool process_output = false; // collect data for the output tensor
|
||||||
bool compute_ppl = true; // whether to compute perplexity
|
bool compute_ppl = true; // whether to compute perplexity
|
||||||
|
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
|
||||||
|
|
||||||
// cvector-generator params
|
// cvector-generator params
|
||||||
int n_pca_batch = 100;
|
int n_pca_batch = 100;
|
||||||
int n_pca_iterations = 1000;
|
int n_pca_iterations = 1000;
|
||||||
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
|
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
|
||||||
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
|
std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
|
||||||
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
|
std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
|
||||||
|
|
||||||
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||||
|
|
||||||
@ -424,6 +431,11 @@ struct common_params {

     // common params
     std::string out_file; // output filename for all example programs
+    // optional callback for model loading progress and cancellation:
+    // called with a progress value between 0.0 and 1.0.
+    // return false from callback to abort model loading or true to continue
+    llama_progress_callback load_progress_callback = NULL;
+    void * load_progress_callback_user_data = NULL;
 };

 // call once at the start of a program if it uses libcommon
@ -501,10 +513,9 @@ static bool string_starts_with(const std::string & str,
     return str.rfind(prefix, 0) == 0;
 }

-static bool string_ends_with(const std::string & str,
-                             const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
-    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
-}
+// While we wait for C++20's std::string::ends_with...
+bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
+size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);

 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
@ -613,16 +624,6 @@ std::string common_detokenize(
         const std::vector<llama_token> & tokens,
         bool special = true);

-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
 //
 // Embedding utils
 //
@ -664,3 +665,9 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

 }
+
+//
+// training utils
+//
+
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
common/json-partial.cpp (new file, 256 lines)
@ -0,0 +1,256 @@
#include "json-partial.h"
|
||||||
|
|
||||||
|
#include "log.h"
|
||||||
|
|
||||||
|
#include <nlohmann/json.hpp>
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
using json = nlohmann::ordered_json;
|
||||||
|
|
||||||
|
enum common_json_stack_element_type {
|
||||||
|
COMMON_JSON_STACK_ELEMENT_OBJECT,
|
||||||
|
COMMON_JSON_STACK_ELEMENT_KEY,
|
||||||
|
COMMON_JSON_STACK_ELEMENT_ARRAY,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct common_json_stack_element {
|
||||||
|
common_json_stack_element_type type;
|
||||||
|
std::string key;
|
||||||
|
};
|
||||||
|
|
||||||
|
bool common_json_parse(
|
||||||
|
const std::string & input,
|
||||||
|
const std::string & healing_marker,
|
||||||
|
common_json & out)
|
||||||
|
{
|
||||||
|
std::string::const_iterator it = input.begin();
|
||||||
|
const auto end = input.end();
|
||||||
|
return common_json_parse(it, end, healing_marker, out);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool common_json_parse(
|
||||||
|
std::string::const_iterator & it,
|
||||||
|
const std::string::const_iterator & end,
|
||||||
|
const std::string & healing_marker,
|
||||||
|
common_json & out)
|
||||||
|
{
|
||||||
|
// // https://json.nlohmann.me/features/parsing/sax_interface/
|
||||||
|
struct json_error_locator : public nlohmann::json_sax<json> {
|
||||||
|
std::size_t position;
|
||||||
|
bool found_error;
|
||||||
|
std::string last_token;
|
||||||
|
std::string exception_message;
|
||||||
|
std::vector<common_json_stack_element> stack;
|
||||||
|
|
||||||
|
json_error_locator() : position(0), found_error(false) {}
|
||||||
|
|
||||||
|
bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
|
||||||
|
this->position = position - 1;
|
||||||
|
this->found_error = true;
|
||||||
|
this->last_token = last_token;
|
||||||
|
this->exception_message = ex.what();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
void close_value() {
|
||||||
|
if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
|
||||||
|
stack.pop_back();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bool null() override { // NOLINT
|
||||||
|
close_value();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
bool boolean(bool) override { // NOLINT
|
||||||
|
close_value();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
bool number_integer(number_integer_t) override { // NOLINT
|
||||||
|
close_value();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
bool number_unsigned(number_unsigned_t) override { // NOLINT
|
||||||
|
close_value();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
bool number_float(number_float_t, const string_t &) override { // NOLINT
|
||||||
|
close_value();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
bool string(string_t &) override { // NOLINT
|
||||||
|
close_value();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
bool binary(binary_t &) override { // NOLINT
|
||||||
|
close_value();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
bool start_object(std::size_t) override { // NOLINT
|
||||||
|
stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
bool end_object() override {
|
||||||
|
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
|
||||||
|
stack.pop_back();
|
||||||
|
close_value();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
bool key(string_t & key) override { // NOLINT
|
||||||
|
stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
bool start_array(std::size_t) override { // NOLINT
|
||||||
|
stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
bool end_array() override {
|
||||||
|
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
|
||||||
|
stack.pop_back();
|
||||||
|
close_value();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
json_error_locator err_loc;
|
||||||
|
auto start = it;
|
||||||
|
json::sax_parse(it, end, &err_loc);
|
||||||
|
|
||||||
|
if (err_loc.found_error) {
|
||||||
|
it = start;
|
||||||
|
auto temptative_end = it + err_loc.position;
|
||||||
|
// LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
|
||||||
|
|
||||||
|
auto input = std::string(it, temptative_end);
|
||||||
|
try {
|
||||||
|
out.json = json::parse(input);
|
||||||
|
// out.json = json::parse(it, temptative_end);
|
||||||
|
it = temptative_end;
|
||||||
|
return true;
|
||||||
|
} catch (const std::exception & ex) {
|
||||||
|
// No, needs healing.
|
||||||
|
LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
|
||||||
|
}
|
||||||
|
auto can_parse = [](const std::string & str) {
|
||||||
|
try {
|
||||||
|
auto _ = json::parse(str); // NOLINT
|
||||||
|
return true;
|
||||||
|
} catch (const std::exception &) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if (!healing_marker.empty() && !err_loc.stack.empty()) {
|
||||||
|
std::string str(it, temptative_end);
|
||||||
|
auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
|
||||||
|
if (last_non_sp_pos == std::string::npos) {
|
||||||
|
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
|
||||||
|
}
|
||||||
|
auto last_non_sp_char = str[last_non_sp_pos];
|
||||||
|
// Used to detect stops on a number, which may not be complete.
|
||||||
|
auto was_maybe_number = [&]() {
|
||||||
|
if (!str.empty() && std::isspace(str.back())) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return std::isdigit(last_non_sp_char) ||
|
||||||
|
last_non_sp_char == '.' ||
|
||||||
|
last_non_sp_char == 'e' ||
|
||||||
|
last_non_sp_char == 'E' ||
|
||||||
|
last_non_sp_char == '-';
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string closing;
|
||||||
|
for (size_t i = err_loc.stack.size(); i > 0; i--) {
|
||||||
|
auto & el = err_loc.stack[i - 1];
|
||||||
|
if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
|
||||||
|
closing += "}";
|
||||||
|
} else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
|
||||||
|
closing += "]";
|
||||||
|
} else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
|
||||||
|
throw std::runtime_error("Unexpected stack element type");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
|
||||||
|
|
||||||
|
if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
|
||||||
|
// We're inside an object value
|
||||||
|
if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
|
||||||
|
// Was about to create an object value
|
||||||
|
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||||
|
} else if (can_parse(str + ": 1" + closing)) {
|
||||||
|
str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
|
||||||
|
} else if (last_non_sp_char == '{' && can_parse(str + closing)) {
|
||||||
|
// Was about to create an object
|
||||||
|
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
|
||||||
|
} else if (can_parse(str + "\"" + closing)) {
|
||||||
|
// Was inside an object value string
|
||||||
|
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
|
||||||
|
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
|
||||||
|
// Was inside an object value string after an escape
|
||||||
|
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
|
||||||
|
} else {
|
||||||
|
// find last :
|
||||||
|
auto last_pos = str.find_last_of(':');
|
||||||
|
if (last_pos == std::string::npos) {
|
||||||
|
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
|
||||||
|
}
|
||||||
|
// Cutting back to opening : for object value
|
||||||
|
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||||
|
}
|
||||||
|
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
|
||||||
|
if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
|
||||||
|
// Was about to create an array value
|
||||||
|
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||||
|
} else if (can_parse(str + "\"" + closing)) {
|
||||||
|
// Was inside an array value string
|
||||||
|
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
|
||||||
|
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
|
||||||
|
// Was inside an array value string after an escape
|
||||||
|
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
|
||||||
|
} else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
|
||||||
|
// Had just finished a value
|
||||||
|
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
|
||||||
|
} else {
|
||||||
|
auto last_pos = str.find_last_of("[,");
|
||||||
|
if (last_pos == std::string::npos) {
|
||||||
|
throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
|
||||||
|
}
|
||||||
|
// Cutting back to last [ or , for array value
|
||||||
|
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||||
|
}
|
||||||
|
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
|
||||||
|
if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
|
||||||
|
(last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
|
||||||
|
// Was about to create an object key+value
|
||||||
|
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
|
||||||
|
} else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
|
||||||
|
// Was about to create an object key+value
|
||||||
|
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
|
||||||
|
} else if (can_parse(str + "\": 1" + closing)) {
|
||||||
|
// Was inside an object key string
|
||||||
|
str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
|
||||||
|
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
|
||||||
|
// Was inside an object key string after an escape
|
||||||
|
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
|
||||||
|
} else {
|
||||||
|
auto last_pos = str.find_last_of(':');
|
||||||
|
if (last_pos == std::string::npos) {
|
||||||
|
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
|
||||||
|
}
|
||||||
|
// fprintf(stderr, "Cutting back to last : for object key+value\n");
|
||||||
|
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
|
||||||
|
}
|
||||||
|
// fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
|
||||||
|
out.json = json::parse(str);
|
||||||
|
it = temptative_end;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// TODO: handle unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
|
||||||
|
// fprintf(stderr, "Closing: TODO\n");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
out.json = json::parse(it, end);
|
||||||
|
it = end;
|
||||||
|
return true;
|
||||||
|
}
|
38  common/json-partial.h  Normal file
@ -0,0 +1,38 @@
#pragma once

#include <nlohmann/json.hpp>

// Healing marker (empty if the JSON was fully parsed / wasn't healed).
struct common_healing_marker {
    // Raw marker.
    std::string marker;

    // Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format).
    std::string json_dump_marker;
};

// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string)
struct common_json {
    nlohmann::ordered_json json;

    common_healing_marker healing_marker;
};

// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
//
// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
// This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker.
// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format).
//
// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
bool common_json_parse(
    const std::string & input,
    const std::string & healing_marker,
    common_json & out);

// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
bool common_json_parse(
    std::string::const_iterator & it,
    const std::string::const_iterator & end,
    const std::string & healing_marker,
    common_json & out);
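As a quick illustration of the API declared above (a sketch only — the partial tool-call string and the `$foo$` marker are made-up values, and the asserts reflect the documented behaviour rather than exact healed output):

```cpp
#include "json-partial.h"

#include <cassert>
#include <string>

int main() {
    // A tool call cut off mid-generation: not valid JSON yet.
    std::string partial = R"({"name": "get_weather", "arguments": {"city": "Par)";

    common_json out;
    // Healing closes the JSON so it can be parsed; the marker lets callers
    // locate (and later cut away) the synthetic part of out.json.dump().
    if (common_json_parse(partial, /* healing_marker = */ "$foo$", out)) {
        assert(!out.healing_marker.marker.empty());
        assert(!out.healing_marker.json_dump_marker.empty());
    }

    // With an empty healing marker, only complete JSON parses, and no
    // healing marker is recorded.
    common_json strict;
    bool ok = common_json_parse(std::string("{\"a\": 1}"), "", strict);
    assert(ok && strict.healing_marker.marker.empty());

    return 0;
}
```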
@ -1,8 +1,9 @@
 #include "json-schema-to-grammar.h"
 #include "common.h"

+#include <nlohmann/json.hpp>
+
 #include <algorithm>
-#include <fstream>
 #include <map>
 #include <regex>
 #include <sstream>
@ -1,9 +1,9 @@
 #pragma once

-#include "ggml.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
+#include <nlohmann/json_fwd.hpp>
+
+#include <functional>
+#include <string>

 std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
     bool force_gbnf = false);
@ -189,6 +189,7 @@ static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab)
         /* .tokenize_fn = */ llama_sampler_llg_tokenize_fn,
         /* .use_approximate_greedy_tokenize_fn = */ false,
         /* .tokenize_user_data = */ vocab,
+        /* .slices = */ nullptr,
     };

     char error_buffer[1024];
204  common/regex-partial.cpp  Normal file
@ -0,0 +1,204 @@
#include "regex-partial.h"
#include "common.h"
#include <functional>
#include <optional>

common_regex::common_regex(const std::string & pattern) :
    pattern(pattern),
    rx(pattern),
    rx_reversed_partial(regex_to_reversed_partial_regex(pattern)) {}

common_regex_match common_regex::search(const std::string & input, size_t pos, bool as_match) const {
    std::smatch match;
    if (pos > input.size()) {
        throw std::runtime_error("Position out of bounds");
    }
    auto start = input.begin() + pos;
    auto found = as_match
        ? std::regex_match(start, input.end(), match, rx)
        : std::regex_search(start, input.end(), match, rx);
    if (found) {
        common_regex_match res;
        res.type = COMMON_REGEX_MATCH_TYPE_FULL;
        for (size_t i = 0; i < match.size(); ++i) {
            auto begin = pos + match.position(i);
            res.groups.emplace_back(begin, begin + match.length(i));
        }
        return res;
    }
    std::match_results<std::string::const_reverse_iterator> srmatch;
    if (std::regex_match(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial)) {
        auto group = srmatch[1].str();
        if (group.length() != 0) {
            auto it = srmatch[1].second.base();
            // auto position = static_cast<size_t>(std::distance(input.begin(), it));
            if ((!as_match) || it == input.begin()) {
                common_regex_match res;
                res.type = COMMON_REGEX_MATCH_TYPE_PARTIAL;
                const size_t begin = std::distance(input.begin(), it);
                const size_t end = input.size();
                if (begin == std::string::npos || end == std::string::npos || begin > end) {
                    throw std::runtime_error("Invalid range");
                }
                res.groups.push_back({begin, end});
                return res;
            }
        }
    }
    return {};
}

/*
  Transforms a regex pattern to a partial match pattern that operates on a reversed input string to find partial final matches of the original pattern.

  Ideally we'd like to use boost::match_partial (https://beta.boost.org/doc/libs/1_59_0/libs/regex/doc/html/boost_regex/partial_matches.html)
  to see if a string ends with a partial regex match, but it's not in std::regex yet.
  Instead, we'll transform the regex into a partial match regex operating as a full match on the reverse iterators of the input.

  - /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:(?:d)?c)?b)?a).*
  - /a|b/ -> (a|b).*
  - /a*?/ -> error, could match ""
  - /a*b/ -> ((?:b)?a*+).* (final repetitions become eager)
  - /.*?ab/ -> ((?:b)?a).* (merge .*)
  - /a.*?b/ -> ((?:b)?.*?a).* (keep reluctant matches)
  - /a(bc)d/ -> ((?:(?:d)?(?:(?:c)?b))?a).*
  - /a(bc|de)/ -> ((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a).*
  - /ab{2,4}c/ -> abbb?b?c -> ((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a).*

  The regex will match a reversed string fully, and the end of the first (and only) capturing group will indicate the reversed start of the original partial pattern
  (i.e. just where the final .* starts in the inverted pattern; all other groups are turned into non-capturing groups, and reluctant quantifiers are ignored)
*/
std::string regex_to_reversed_partial_regex(const std::string & pattern) {
    auto it = pattern.begin();
    const auto end = pattern.end();

    std::function<std::string()> process = [&]() {
        std::vector<std::vector<std::string>> alternatives(1);
        std::vector<std::string> * sequence = &alternatives.back();

        while (it != end) {
            if (*it == '[') {
                auto start = it;
                ++it;
                while (it != end) {
                    if ((*it == '\\') && (++it != end)) {
                        ++it;
                    } else if ((it != end) && (*it == ']')) {
                        break;
                    } else {
                        ++it;
                    }
                }
                if (it == end) {
                    throw std::runtime_error("Unmatched '[' in pattern");
                }
                ++it;
                sequence->push_back(std::string(start, it));
            } else if (*it == '*' || *it == '?' || *it == '+') {
                if (sequence->empty()) {
                    throw std::runtime_error("Quantifier without preceding element");
                }
                sequence->back() += *it;
                auto is_star = *it == '*';
                ++it;
                if (is_star) {
                    if (*it == '?') {
                        ++it;
                    }
                }
            } else if (*it == '{') {
                if (sequence->empty()) {
                    throw std::runtime_error("Repetition without preceding element");
                }
                ++it;
                auto start = it;
                while (it != end && *it != '}') {
                    ++it;
                }
                if (it == end) {
                    throw std::runtime_error("Unmatched '{' in pattern");
                }
                auto parts = string_split(std::string(start, it), ",");
                ++it;
                if (parts.size() > 2) {
                    throw std::runtime_error("Invalid repetition range in pattern");
                }

                auto parseOptInt = [&](const std::string & s, const std::optional<int> & def = std::nullopt) -> std::optional<int> {
                    if (s.empty()) {
                        return def;
                    }
                    return std::stoi(s);
                };
                auto min = parseOptInt(parts[0], 0);
                auto max = parts.size() == 1 ? min : parseOptInt(parts[1]);
                if (min && max && *max < *min) {
                    throw std::runtime_error("Invalid repetition range in pattern");
                }
                // Brutal but... let's repeat at least min times, then ? for the delta between min & max (or * for unbounded)
                auto part = sequence->back();
                sequence->pop_back();
                for (int i = 0; i < *min; i++) {
                    sequence->push_back(part);
                }
                if (max) {
                    for (int i = *min; i < *max; i++) {
                        sequence->push_back(part + "?");
                    }
                } else {
                    sequence->push_back(part + "*");
                }
            } else if (*it == '(') {
                ++it;
                if (it != end && *it == '?' && (it + 1 != end) && *(it + 1) == ':') {
                    it += 2;
                }
                auto sub = process();
                if (*it != ')') {
                    throw std::runtime_error("Unmatched '(' in pattern");
                }
                ++it;
                auto & part = sequence->emplace_back("(?:");
                part += sub;
                part += ")";
            } else if (*it == ')') {
                break;
            } else if (*it == '|') {
                ++it;
                alternatives.emplace_back();
                sequence = &alternatives.back();
            } else if (*it == '\\' && (++it != end)) {
                auto str = std::string("\\") + *it;
                sequence->push_back(str);
                ++it;
            } else if (it != end) {
                sequence->push_back(std::string(1, *it));
                ++it;
            }
        }

        // /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:d)?c)?b)?a).*
        // if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
        // We'll do the outermost capturing group and final .* in the enclosing function.
        std::vector<std::string> res_alts;
        for (const auto & parts : alternatives) {
            auto & res = res_alts.emplace_back();
            for (size_t i = 0; i < parts.size() - 1; i++) {
                res += "(?:";
            }
            for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
                res += *it;
                if (it != parts.rend() - 1) {
                    res += ")?";
                }
            }
        }
        return string_join(res_alts, "|");
    };
    auto res = process();
    if (it != end) {
        throw std::runtime_error("Unmatched '(' in pattern");
    }

    return "(" + res + ")[\\s\\S]*";
}
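To make the comment block above concrete, the transformation can be sanity-checked along these lines (a sketch; the expected strings follow directly from the construction in `regex_to_reversed_partial_regex` above):

```cpp
#include "regex-partial.h"

#include <cassert>
#include <string>

int main() {
    // /abcd/ becomes a pattern that fully matches the *reversed* input and
    // captures whatever suffix of the input could still grow into "abcd".
    assert(regex_to_reversed_partial_regex("abcd") == "((?:(?:(?:d)?c)?b)?a)[\\s\\S]*");

    // Alternation keeps one reversed branch per alternative.
    assert(regex_to_reversed_partial_regex("a|b") == "(a|b)[\\s\\S]*");

    return 0;
}
```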
56  common/regex-partial.h  Normal file
@ -0,0 +1,56 @@
#pragma once

#include <regex>
#include <string>

enum common_regex_match_type {
    COMMON_REGEX_MATCH_TYPE_NONE,
    COMMON_REGEX_MATCH_TYPE_PARTIAL,
    COMMON_REGEX_MATCH_TYPE_FULL,
};

struct common_string_range {
    size_t begin;
    size_t end;
    common_string_range(size_t begin, size_t end) : begin(begin), end(end) {
        if (begin > end) {
            throw std::runtime_error("Invalid range");
        }
    }
    // prevent default ctor
    common_string_range() = delete;
    bool empty() const {
        return begin == end;
    }
    bool operator==(const common_string_range & other) const {
        return begin == other.begin && end == other.end;
    }
};

struct common_regex_match {
    common_regex_match_type type = COMMON_REGEX_MATCH_TYPE_NONE;
    std::vector<common_string_range> groups;

    bool operator==(const common_regex_match & other) const {
        return type == other.type && groups == other.groups;
    }
    bool operator!=(const common_regex_match & other) const {
        return !(*this == other);
    }
};

class common_regex {
    std::string pattern;
    std::regex rx;
    std::regex rx_reversed_partial;

  public:
    explicit common_regex(const std::string & pattern);

    common_regex_match search(const std::string & input, size_t pos, bool as_match = false) const;

    const std::string & str() const { return pattern; }
};

// For testing only (pretty print of failures).
std::string regex_to_reversed_partial_regex(const std::string & pattern);
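A minimal usage sketch of `common_regex` as declared above, assuming the full/partial semantics implemented in regex-partial.cpp (the input strings are illustrative):

```cpp
#include "regex-partial.h"

#include <cassert>
#include <string>

int main() {
    common_regex re("<tool_call>");

    // Full match: the pattern occurs somewhere in the input.
    auto m1 = re.search("text <tool_call> more", 0);
    assert(m1.type == COMMON_REGEX_MATCH_TYPE_FULL);

    // Partial match: the input ends with a prefix of the pattern -- the
    // streaming case, where upcoming tokens may still complete the match.
    const std::string input = "some text <tool_";
    auto m2 = re.search(input, 0);
    assert(m2.type == COMMON_REGEX_MATCH_TYPE_PARTIAL);
    // groups[0] spans from the start of the partial match to the end of the input.
    assert(m2.groups[0].end == input.size());

    return 0;
}
```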
@ -1,6 +1,7 @@
 #include "sampling.h"

 #include "common.h"
+#include "log.h"

 #include <cmath>
 #include <unordered_map>
@ -160,7 +161,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
-        std::vector<std::string> patterns_at_start;
+        std::vector<std::string> trigger_patterns;
         std::vector<std::string> patterns_anywhere;
         std::vector<llama_token> trigger_tokens;
         for (const auto & trigger : params.grammar_triggers) {
@ -172,10 +173,13 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                     break;
                 }
                 case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
-                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
                 {
-                    const auto & pattern = trigger.value;
-                    (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
+                    patterns_anywhere.push_back(trigger.value);
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
+                {
+                    trigger_patterns.push_back(trigger.value);
                     break;
                 }
                 case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
@ -189,10 +193,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             }
         }

-        std::vector<std::string> trigger_patterns;
-        if (!patterns_at_start.empty()) {
-            trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
-        }
         if (!patterns_anywhere.empty()) {
             trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
         }
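For reference, the anchored trigger pattern built above has the following shape; a self-contained sketch with made-up trigger strings and a local stand-in for `string_join`:

```cpp
#include <string>
#include <vector>

// Minimal stand-in for common's string_join, just for this sketch.
static std::string join(const std::vector<std::string> & parts, const std::string & sep) {
    std::string out;
    for (size_t i = 0; i < parts.size(); i++) {
        if (i > 0) out += sep;
        out += parts[i];
    }
    return out;
}

int main() {
    // Hypothetical trigger patterns, e.g. taken from a chat template's tool-call syntax.
    std::vector<std::string> patterns_anywhere = { "<tool_call>", "\\[TOOL_CALLS\\]" };

    // Same shape as the expression in the hunk above: lazily skip any prefix,
    // require one of the triggers, then accept the rest of the output.
    std::string trigger = "^[\\s\\S]*?(" + join(patterns_anywhere, "|") + ")[\\s\\S]*";
    // trigger == "^[\s\S]*?(<tool_call>|\[TOOL_CALLS\])[\s\S]*"
    (void) trigger;
    return 0;
}
```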
@ -229,51 +229,48 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             params.logit_bias.data()));

     if (params.mirostat == 0) {
-        if (params.top_n_sigma >= 0) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp        (params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
-        } else {
         for (const auto & cnstr : params.samplers) {
             switch (cnstr) {
                 case COMMON_SAMPLER_TYPE_DRY:
                     {
                         std::vector<const char *> c_breakers;
                         c_breakers.reserve(params.dry_sequence_breakers.size());
                         for (const auto & str : params.dry_sequence_breakers) {
                             c_breakers.push_back(str.c_str());
                         }

                         llama_sampler_chain_add(result->chain, llama_sampler_init_dry         (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                     }
                     break;
                 case COMMON_SAMPLER_TYPE_TOP_K:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
                     break;
                 case COMMON_SAMPLER_TYPE_TOP_P:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_top_p       (params.top_p, params.min_keep));
                     break;
+                case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+                    break;
                 case COMMON_SAMPLER_TYPE_MIN_P:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_min_p       (params.min_p, params.min_keep));
                     break;
                 case COMMON_SAMPLER_TYPE_XTC:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_xtc         (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                     break;
                 case COMMON_SAMPLER_TYPE_TYPICAL_P:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_typical     (params.typ_p, params.min_keep));
                     break;
                 case COMMON_SAMPLER_TYPE_TEMPERATURE:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext    (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                     break;
                 case COMMON_SAMPLER_TYPE_INFILL:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_infill      (vocab));
                     break;
                 case COMMON_SAMPLER_TYPE_PENALTIES:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_penalties   (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
                     break;
                 default:
                     GGML_ASSERT(false && "unknown sampler type");
             }
         }
-        }
         llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
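With top-n-sigma now an ordinary member of the sampler switch, the behaviour of the removed dedicated branch (top-k → temp → top-n-sigma) can be recovered through sampler ordering alone; a hedged sketch, assuming the sampling params struct is `common_params_sampling` from common.h and using an arbitrary example value:

```cpp
#include "common.h"
#include "sampling.h"

// Sketch: reproduce the former top_n_sigma fast path purely through sampler
// ordering, now that TOP_N_SIGMA is a regular chain member.
static void configure_top_n_sigma(common_params_sampling & sparams) {
    sparams.top_n_sigma = 1.5f; // assumed example value
    sparams.samplers = {
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TEMPERATURE,
        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
    };
}
```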
@ -475,6 +472,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
         case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
+        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
         case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
         case COMMON_SAMPLER_TYPE_XTC:         return 'x';
@ -490,6 +488,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
         case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
+        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
         case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
         case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
@ -504,6 +503,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "dry",         COMMON_SAMPLER_TYPE_DRY },
         { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
         { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
         { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
@ -517,6 +517,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
     std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
         { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
         { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
         { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
         { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
@ -533,14 +534,16 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         auto sampler = sampler_canonical_name_map.find(name);
         if (sampler != sampler_canonical_name_map.end()) {
             samplers.push_back(sampler->second);
-        } else {
-            if (allow_alt_names) {
-                sampler = sampler_alt_name_map.find(name);
-                if (sampler != sampler_alt_name_map.end()) {
-                    samplers.push_back(sampler->second);
-                }
+            continue;
+        }
+        if (allow_alt_names) {
+            sampler = sampler_alt_name_map.find(name);
+            if (sampler != sampler_alt_name_map.end()) {
+                samplers.push_back(sampler->second);
+                continue;
             }
         }
+        LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
     }

     return samplers;
@ -552,6 +555,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
@ -566,6 +570,8 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         const auto sampler = sampler_name_map.find(c);
         if (sampler != sampler_name_map.end()) {
             samplers.push_back(sampler->second);
+        } else {
+            LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
         }
     }

@ -144,6 +144,8 @@ llama_tokens common_speculative_gen_draft(
     auto & smpl = spec->smpl;
     auto & prompt = spec->prompt;

+    auto * mem = llama_get_memory(ctx);
+
     int reuse_i = 0;
     int reuse_n = 0;

@ -173,7 +175,7 @@ llama_tokens common_speculative_gen_draft(
     result.reserve(params.n_draft);

     if (reuse_n == 0) {
-        llama_kv_self_clear(ctx);
+        llama_memory_clear(mem, false);

         prompt.clear();
     } else {
@ -192,14 +194,14 @@ llama_tokens common_speculative_gen_draft(
         }
     }

     if (reuse_i > 0) {
-        llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
-        llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+        llama_memory_seq_rm (mem, 0, 0, reuse_i);
+        llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);

         prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
     }

     if (reuse_n < (int) prompt.size()) {
-        llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
+        llama_memory_seq_rm (mem, 0, reuse_n, -1);

         prompt.erase(prompt.begin() + reuse_n, prompt.end());
     }
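A condensed sketch of the cache-reuse steps above, wrapped in a helper for readability; only the `llama_memory_*` calls that appear in this hunk are used (the helper name and signature are illustrative):

```cpp
#include "llama.h"

#include <vector>

// Restatement of the reuse logic above: mem is obtained from the context,
// reuse_i/reuse_n describe the reusable window of the cached draft prompt.
static void reuse_draft_cache(llama_context * ctx, std::vector<llama_token> & prompt, int reuse_i, int reuse_n) {
    auto * mem = llama_get_memory(ctx);

    if (reuse_n == 0) {
        // Nothing reusable: drop the draft cache and the cached prompt copy.
        llama_memory_clear(mem, false);
        prompt.clear();
        return;
    }
    if (reuse_i > 0) {
        // Remove the first reuse_i cached tokens, then shift the rest left.
        llama_memory_seq_rm (mem, 0, 0, reuse_i);
        llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);
        prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
    }
    if (reuse_n < (int) prompt.size()) {
        // Trim everything past the reusable region.
        llama_memory_seq_rm(mem, 0, reuse_n, -1);
        prompt.erase(prompt.begin() + reuse_n, prompt.end());
    }
}
```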
File diff suppressed because it is too large
@ -1,28 +1,6 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-
-# This script downloads the tokenizer models of the specified models from Huggingface and
-# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
-#
-# This is necessary in order to analyze the type of pre-tokenizer used by the model and
-# provide the necessary information to llama.cpp via the GGUF header in order to implement
-# the same pre-tokenizer.
-#
-# ref: https://github.com/ggml-org/llama.cpp/pull/6920
-#
-# Instructions:
-#
-# - Add a new model to the "models" list
-# - Run the script with your huggingface token:
-#
-#   python3 convert_hf_to_gguf_update.py <huggingface_token>
-#
-# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
-# - Update llama.cpp with the new pre-tokenizer if necessary
-#
-# TODO: generate tokenizer tests for llama.cpp
-#

 import logging
 import os
 import pathlib
@ -32,6 +10,7 @@ import requests
 import sys
 import json
 import shutil
+import argparse

 from hashlib import sha256
 from enum import IntEnum, auto
@ -41,6 +20,11 @@ logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("convert_hf_to_gguf_update")
 sess = requests.Session()

+convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
+convert_py = convert_py_pth.read_text(encoding="utf-8")
+hf_token_pth = pathlib.Path.home() / ".cache" / "huggingface" / "token"
+hf_token = hf_token_pth.read_text(encoding="utf-8").strip() if hf_token_pth.exists() else None
+

 class TOKENIZER_TYPE(IntEnum):
     SPM = auto()
@ -49,20 +33,49 @@ class TOKENIZER_TYPE(IntEnum):
     UGM = auto()


+DOC_STRING = """
+This script downloads the tokenizer models of the specified models from Huggingface and
+generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
+
+/!\\ It is intended to be used by contributors and is not meant to be run by end users
+
+This is necessary in order to analyze the type of pre-tokenizer used by the model and
+provide the necessary information to llama.cpp via the GGUF header in order to implement
+the same pre-tokenizer.
+
+ref: https://github.com/ggml-org/llama.cpp/pull/6920
+
+Instructions:
+
+- Add a new model to the "models" list
+- Run the script with your huggingface token
+    By default, token will be read from ~/.cache/huggingface/token
+- The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
+- Update llama.cpp with the new pre-tokenizer if necessary
+"""
+# TODO: generate tokenizer tests for llama.cpp
+
+parser = argparse.ArgumentParser(description=DOC_STRING, formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument(
+    "--full", action="store_true",
+    help="download full list of models - make sure you have access to all of them",
+)
+parser.add_argument(
+    "hf_token",
+    help="optional HF token",
+    nargs="?",
+)
+args = parser.parse_args()
+hf_token = args.hf_token if args.hf_token is not None else hf_token
+
+if hf_token is None:
+    logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
+    sys.exit(1)
+
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
 CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

-if len(sys.argv) == 2:
-    token = sys.argv[1]
-    if not token.startswith("hf_"):
-        logger.info("Huggingface token seems invalid")
-        logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
-        sys.exit(1)
-else:
-    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
-    sys.exit(1)
-
 # TODO: add models here, base models preferred
 models = [
     {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
@ -103,7 +116,6 @@ models = [
     {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
     {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
     {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
-    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
     {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
     {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
     {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
@ -114,8 +126,17 @@ models = [
     {"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
     {"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
     {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
-    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
     {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
+    {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
 ]
+
+# some models are known to be broken upstream, so we will skip them as exceptions
+pre_computed_hashes = [
+    # chatglm-bpe has 2 hashes, why?
+    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"},
+    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
+    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
+    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
+]
@ -168,9 +189,29 @@ def download_model(model):
         if os.path.isfile(save_path):
             logger.info(f"{name}: File {save_path} already exists - skipping")
             continue
-        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+        download_file_with_auth(f"{repo}/resolve/main/{file}", hf_token, save_path)


+# get list of existing models and chkhsh from the convert_hf_to_gguf.py file
+# returns mapping res --> chkhsh
+def get_existing_models(convert_py):
+    pattern = r'if chkhsh == "([a-f0-9]{64})":\s*\n\s*.*\s*res = "([^"]+)"'
+    matches = re.findall(pattern, convert_py)
+    output = {}
+    for chkhsh, res in matches:
+        output[res] = chkhsh
+    return output
+
+
+existing_models = {}
+all_models = models.copy()
+if not args.full:
+    # Filter out models that already exist in convert_hf_to_gguf.py
+    existing_models = get_existing_models(convert_py)
+    all_models = models.copy()
+    models = [model for model in all_models if model["name"] not in existing_models]
+
+logging.info(f"Downloading {len(models)} models...")
 for model in models:
     try:
         download_model(model)
@ -181,9 +222,10 @@ for model in models:
 # generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:

 src_ifs = ""
-for model in models:
+for model in [*all_models, *pre_computed_hashes]:
     name = model["name"]
     tokt = model["tokt"]
+    chkhsh = model.get("chkhsh")

     if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
         continue
@ -194,35 +236,44 @@ for model in models:
         continue

     # create the tokenizer
-    try:
-        if name == "t5":
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    except OSError as e:
-        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
-        continue  # Skip to the next model if the tokenizer can't be loaded
+    if chkhsh is not None:
+        # if the model has a pre-computed hash, use it
+        logger.info(f"Using pre-computed hash for model {name}: {chkhsh}")
+    elif name in existing_models:
+        # if the model already exists in convert_hf_to_gguf.py, skip compute hash
+        chkhsh = existing_models[name]
+    else:
+        # otherwise, compute the hash of the tokenizer
+        try:
+            logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
+            if name == "t5":
+                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+        except OSError as e:
+            logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
+            continue  # Skip to the next model if the tokenizer can't be loaded

-    chktok = tokenizer.encode(CHK_TXT)
-    chkhsh = sha256(str(chktok).encode()).hexdigest()
+        chktok = tokenizer.encode(CHK_TXT)
+        chkhsh = sha256(str(chktok).encode()).hexdigest()

-    logger.info(f"model: {name}")
-    logger.info(f"tokt: {tokt}")
-    logger.info(f"repo: {model['repo']}")
-    logger.info(f"chktok: {chktok}")
-    logger.info(f"chkhsh: {chkhsh}")
+        logger.info(f"model: {name}")
+        logger.info(f"tokt: {tokt}")
+        logger.info(f"repo: {model['repo']}")
+        logger.info(f"chktok: {chktok}")
+        logger.info(f"chkhsh: {chkhsh}")

-    # print the "pre_tokenizer" content from the tokenizer.json
-    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
-        cfg = json.load(f)
-        normalizer = cfg["normalizer"]
-        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
-        pre_tokenizer = cfg["pre_tokenizer"]
-        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
-        if "ignore_merges" in cfg["model"]:
-            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
+        # print the "pre_tokenizer" content from the tokenizer.json
+        with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
+            cfg = json.load(f)
+            normalizer = cfg["normalizer"]
+            logger.info("normalizer: " + json.dumps(normalizer, indent=4))
+            pre_tokenizer = cfg["pre_tokenizer"]
+            logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+            if "ignore_merges" in cfg["model"]:
+                logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))

-    logger.info("")
+        logger.info("")

     src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
     src_ifs += f"            # ref: {model['repo']}\n"
@ -270,8 +321,6 @@ src_func = f"""
     return res
 """

-convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
-convert_py = convert_py_pth.read_text(encoding="utf-8")
 convert_py = re.sub(
     r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
     lambda m: m.group(1) + src_func + m.group(3),
@ -287,7 +336,7 @@ logger.info("+++ convert_hf_to_gguf.py was updated")

 tests = [
     "ied 4 ½ months",
-    "Führer",
+    "Äpfel",
     "",
     " ",
     "  ",
@ -366,6 +415,10 @@ for model in models:
         logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
         continue  # Skip this model and continue with the next one in the loop

+    if not os.path.exists(f"models/ggml-vocab-{name}.gguf"):
+        logger.info(f"Skip vocab files for model {name}, no GGUF file found")
+        continue
+
     with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
         for text in tests:
             f.write(f"{text}")
155
docs/backend/CANN.md
Normal file → Executable file
155
docs/backend/CANN.md
Normal file → Executable file
@ -8,6 +8,7 @@
|
|||||||
- [DataType Supports](#datatype-supports)
|
- [DataType Supports](#datatype-supports)
|
||||||
- [Docker](#docker)
|
- [Docker](#docker)
|
||||||
- [Linux](#linux)
|
- [Linux](#linux)
|
||||||
|
- [Environment variable setup](#environment-variable-setup)
|
||||||
- [TODO](#todo)
|
- [TODO](#todo)
|
||||||
|
|
||||||
|
|
||||||
@ -56,60 +57,82 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
|||||||
|
|
||||||
## Model Supports
|
## Model Supports
|
||||||
|
|
||||||
| Model Name | FP16 | Q8_0 | Q4_0 |
|
| Model Name | FP16 | Q4_0 | Q8_0 |
|
||||||
|:----------------------------|:-----:|:----:|:----:|
|
|:----------------------------|:-----:|:----:|:----:|
|
||||||
| AquilaChat2-7B | √ | √ | √ |
|
| Llama-2 | √ | √ | √ |
|
||||||
| Baichuan-7b | √ | √ | √ |
|
| Llama-3 | √ | √ | √ |
|
||||||
| Baichuan2-7B-Chat | √ | √ | √ |
|
| Mistral-7B | √ | √ | √ |
|
||||||
| bitnet_b1_58-large | √ | √ | √ |
|
| Mistral MOE | √ | √ | √ |
|
||||||
| bloom-560m | √ | x | √ |
|
| DBRX | - | - | - |
|
||||||
| bloomz-alpaca-560m | √ | x | √ |
|
| Falcon | √ | √ | √ |
|
||||||
| c4ai-command-r-35B-v01 | x | x | x |
|
| Chinese LLaMA/Alpaca | √ | √ | √ |
|
||||||
| chatglm3-6B | x | x | x |
|
| Vigogne(French) | √ | √ | √ |
|
||||||
| chinese-alpaca-2-1.3b | √ | √ | √ |
|
| BERT | x | x | x |
|
||||||
| CodeShell-7B | √ | √ | √ |
|
| Koala | √ | √ | √ |
|
||||||
| deepseek-ai_deepseek-coder-1.3B-base | x | x | x |
|
| Baichuan | √ | √ | √ |
|
||||||
| deepseek-ai_DeepSeek-V2-Lite | x | x | x |
|
| Aquila 1 & 2 | √ | √ | √ |
|
||||||
| deepseek-coder-6.7B-instruct | x | x | x |
|
| Starcoder models | √ | √ | √ |
|
||||||
| DeepSeek-V2-Lite-64x1.5B | x | x | x |
|
| Refact | √ | √ | √ |
|
||||||
| falcon-7b-instruct | √ | √ | √ |
|
| MPT | √ | √ | √ |
|
||||||
| flan-t5-large | √ | √ | √ |
|
| Bloom | √ | √ | √ |
|
||||||
| gemma-2-9b-it | √ | √ | √ |
|
| Yi models | √ | √ | √ |
|
||||||
| glm-4-9B | x | x | x |
|
| stablelm models | √ | √ | √ |
|
||||||
| gpt2 | √ | √ | √ |
|
| DeepSeek models | x | x | x |
|
||||||
| Gpt2-163M | √ | √ | √ |
|
| Qwen models | √ | √ | √ |
|
||||||
| granite-3B-code-instruct | √ | √ | √ |
|
| PLaMo-13B | √ | √ | √ |
|
||||||
|
| Phi models | √ | √ | √ |
|
||||||
|
| PhiMoE | √ | √ | √ |
|
||||||
|
| GPT-2 | √ | √ | √ |
|
||||||
|
| Orion | √ | √ | √ |
|
||||||
|
| InternlLM2 | √ | √ | √ |
|
||||||
|
| CodeShell | √ | √ | √ |
|
||||||
|
| Gemma | √ | √ | √ |
|
||||||
|
| Mamba | √ | √ | √ |
|
||||||
|
| Xverse | √ | √ | √ |
|
||||||
|
| command-r models | √ | √ | √ |
|
||||||
|
| Grok-1 | - | - | - |
|
||||||
|
| SEA-LION | √ | √ | √ |
|
||||||
| GritLM-7B | √ | √ | √ |
|
| GritLM-7B | √ | √ | √ |
|
||||||
| internlm2_5-7b-chat | √ | √ | √ |
|
| OLMo | √ | √ | √ |
|
||||||
| koala-7B-HF | √ | √ | √ |
|
| OLMo 2 | √ | √ | √ |
|
||||||
| Llama-2-7b-chat-hf | √ | √ | √ |
|
| OLMoE | √ | √ | √ |
|
||||||
| Llama-3-Smaug-8B | √ | √ | √ |
|
| Granite models | √ | √ | √ |
|
||||||
| Llama2-Chinese-7b-Chat | √ | √ | √ |
|
| GPT-NeoX | √ | √ | √ |
|
||||||
| Llama3-8B | √ | √ | √ |
|
| Pythia | √ | √ | √ |
|
||||||
| Llama3-8b-chinese | √ | √ | √ |
|
| Snowflake-Arctic MoE | - | - | - |
|
||||||
| mamba-130m-hf | √ | √ | √ |
|
| Smaug | √ | √ | √ |
|
||||||
| Mistral-7B-Instruct-v0.2 | √ | √ | √ |
|
| Poro 34B | √ | √ | √ |
|
||||||
| Mixtral-8x7B-Instruct-v0.1 | x | √ | √ |
|
| Bitnet b1.58 models | √ | x | x |
|
||||||
| mpt-7B | √ | √ | √ |
|
| Flan-T5 | √ | √ | √ |
|
||||||
| OLMo-1B-hf | √ | √ | √ |
|
| Open Elm models | x | √ | √ |
|
||||||
| OpenELM-3B-Instruct | √ | √ | √ |
|
| chatGLM3-6B + ChatGLM4-9b + GLMEdge-1.5b + GLMEdge-4b | √ | √ | √ |
|
||||||
| Orion-14b-base | √ | √ | √ |
|
| GLM-4-0414 | √ | √ | √ |
|
||||||
| phi1 | x | x | x |
|
| SmolLM | √ | √ | √ |
|
||||||
| phi2 | x | x | x |
|
| EXAONE-3.0-7.8B-Instruct | √ | √ | √ |
|
||||||
| Phi-3-mini-4k-instruct | √ | √ | √ |
|
| FalconMamba Models | √ | √ | √ |
|
||||||
| plamo-13b | √ | √ | √ |
|
| Jais Models | - | x | x |
|
||||||
| pythia-70M | x | x | x |
|
| Bielik-11B-v2.3 | √ | √ | √ |
|
||||||
| Qwen-7B | √ | √ | √ |
|
| RWKV-6 | - | √ | √ |
|
||||||
| Qwen2-1.5B-Instruct | √ | x | √ |
|
| QRWKV-6 | √ | √ | √ |
|
||||||
| Refact-1_6B-fim | √ | √ | √ |
|
| GigaChat-20B-A3B | x | x | x |
|
||||||
| SmolLM-135M | √ | √ | √ |
|
| Trillion-7B-preview | √ | √ | √ |
|
||||||
| stablelm-zephyr | x | x | x |
|
| Ling models | √ | √ | √ |
|
||||||
| stablelm-2-zephyr-1_6b | x | x | x |
|
|
||||||
| starcoderbase-1b | √ | √ | √ |
|
|
||||||
| starcoder2-3b | √ | √ | √ |
|
**Multimodal**
|
||||||
| vigogne-7b-chat | √ | √ | √ |
|
| Model Name | FP16 | Q4_0 | Q8_0 |
|
||||||
| xverse-7b-chat | √ | √ | √ |
|
|:----------------------------|:-----:|:----:|:----:|
|
||||||
| Yi-6b-Chat | √ | √ | √ |
|
| LLaVA 1.5 models, LLaVA 1.6 models | x | x | x |
|
||||||
|
| BakLLaVA | √ | √ | √ |
|
||||||
|
| Obsidian | √ | - | - |
|
||||||
|
| ShareGPT4V | x | - | - |
|
||||||
|
| MobileVLM 1.7B/3B models | - | - | - |
|
||||||
|
| Yi-VL | - | - | - |
|
||||||
|
| Mini CPM | √ | √ | √ |
|
||||||
|
| Moondream | √ | √ | √ |
|
||||||
|
| Bunny | √ | - | - |
|
||||||
|
| GLM-EDGE | √ | √ | √ |
|
||||||
|
| Qwen2-VL | √ | √ | √ |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -258,6 +281,34 @@ cmake --build build --config release
|
|||||||
### **GitHub contribution**:
|
### **GitHub contribution**:
|
||||||
Please add the **[CANN]** prefix/tag in issues/PRs titles to help the CANN-team check/address them without delay.
|
Please add the **[CANN]** prefix/tag in issues/PRs titles to help the CANN-team check/address them without delay.
|
||||||
|
|
||||||
|
## Updates
|
||||||
|
### Basic Flash Attention Support
|
||||||
|
The basic FA kernel with aclnnops has been added in aclnn_ops.cpp.
|
||||||
|
Currently, the FA only supports the cases with FP16 KV tensors and NO logit softcap.
|
||||||
|
Since the aclnn interface for flash attention cannot support the logit softcap, we will only update the quantized version in the future.
|
||||||
|
|
||||||
|
Authors from Peking University: Bizhao Shi (bshi@pku.edu.cn), Yuxin Yang (yxyang@pku.edu.cn), Ruiyang Ma (ruiyang@stu.pku.edu.cn), and Guojie Luo (gluo@pku.edu.cn).
|
||||||
|
|
||||||
|
We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers from Huawei Technologies Co., Ltd for their help during the code development and pull request.
|
||||||
|
|
||||||
|
## Environment variable setup
|
||||||
|
|
||||||
|
### GGML_CANN_ASYNC_MODE
|
||||||
|
|
||||||
|
Enables asynchronous operator submission. Disabled by default.
|
||||||
|
|
||||||
|
### GGML_CANN_MEM_POOL
|
||||||
|
|
||||||
|
Specifies the memory pool management strategy:
|
||||||
|
|
||||||
|
- vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool.
|
||||||
|
|
||||||
|
- prio: Employs a priority queue-based memory pool management.
|
||||||
|
- leg: Uses a fixed-size buffer pool.
|
||||||
|
|
||||||
|
### GGML_CANN_DISABLE_BUF_POOL_CLEAN
|
||||||
|
|
||||||
|
Controls automatic cleanup of the memory pool. This option is only effective when using the prio or leg memory pool strategies.
|
||||||
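For example, the variables above can be combined when launching any of the tools (a sketch; the model path is a placeholder and `1` is assumed to be the enabling value):

```sh
# async operator submission on, priority-queue memory pool,
# automatic buffer-pool cleanup left enabled
GGML_CANN_ASYNC_MODE=1 GGML_CANN_MEM_POOL=prio \
  ./build/bin/llama-cli -m /path/to/model.gguf -ngl 32 -p "Hello"
```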
|
|
||||||
## TODO
|
## TODO
|
||||||
- Support more models and data types.
|
- Support more models and data types.
|
||||||
|
@ -17,25 +17,25 @@
|
|||||||
|
|
||||||
**SYCL** is a high-level parallel programming model designed to improve developers' productivity when writing code across various hardware accelerators such as CPUs, GPUs, and FPGAs. It is a single-source language designed for heterogeneous computing, based on standard C++17.
|
||||||
|
|
||||||
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
|
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to Intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
|
||||||
|
|
||||||
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
|
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
|
||||||
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
|
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
|
||||||
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
|
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over Intel iGPUs and dGPUs.
|
||||||
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
|
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
|
||||||
|
|
||||||
### Llama.cpp + SYCL
|
### Llama.cpp + SYCL
|
||||||
|
|
||||||
The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD.
|
The llama.cpp SYCL backend is primarily designed for **Intel GPUs**.
|
||||||
|
SYCL cross-platform capabilities enable support for Nvidia GPUs as well, with limited support for AMD.
|
||||||
|
|
||||||
## Recommended Release
|
## Recommended Release
|
||||||
|
|
||||||
The SYCL backend would be broken by some PRs due to no online CI.
|
The following releases are verified and recommended:
|
||||||
|
|
||||||
The following release is verified with good quality:
|
|
||||||
|
|
||||||
|Commit ID|Tag|Release|Verified Platform| Update date|
|
|Commit ID|Tag|Release|Verified Platform| Update date|
|
||||||
|-|-|-|-|-|
|
|-|-|-|-|-|
|
||||||
|
|24e86cae7219b0f3ede1d5abdf5bf3ad515cccb8|b5377 |[llama-b5377-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b5377/llama-b5377-bin-win-sycl-x64.zip) |ArcB580/Linux/oneAPI 2025.1<br>LNL Arc GPU/Windows 11/oneAPI 2025.1.1|2025-05-15|
|
||||||
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|
||||||
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
|
||||||
|
|
||||||
@ -106,15 +106,14 @@ SYCL backend supports Intel GPU Family:
|
|||||||
|-------------------------------|---------|---------------------------------------|
|
|-------------------------------|---------|---------------------------------------|
|
||||||
| Intel Data Center Max Series | Support | Max 1550, 1100 |
|
| Intel Data Center Max Series | Support | Max 1550, 1100 |
|
||||||
| Intel Data Center Flex Series | Support | Flex 170 |
|
| Intel Data Center Flex Series | Support | Flex 170 |
|
||||||
| Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
|
| Intel Arc Series | Support | Arc 770, 730M, Arc A750, B580 |
|
||||||
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake, Arrow Lake |
|
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake, Arrow Lake, Lunar Lake |
|
||||||
| Intel iGPU | Support | iGPU in 13700k,iGPU in 13400, i5-1250P, i7-1260P, i7-1165G7 |
|
| Intel iGPU | Support | iGPU in 13700k, 13400, i5-1250P, i7-1260P, i7-1165G7 |
|
||||||
|
|
||||||
*Notes:*
|
*Notes:*
|
||||||
|
|
||||||
- **Memory**
|
- **Memory**
|
||||||
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
|
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
|
||||||
|
|
||||||
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, the *llama-2-7b.Q4_0* requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU.
|
||||||
|
|
||||||
- **Execution Unit (EU)**
|
- **Execution Unit (EU)**
|
||||||
@ -138,9 +137,11 @@ Note: AMD GPU support is highly experimental and is incompatible with F16.
|
|||||||
Additionally, it only supports GPUs with a sub_group_size (warp size) of 32.
|
Additionally, it only supports GPUs with a sub_group_size (warp size) of 32.
|
||||||
|
|
||||||
## Docker
|
## Docker
|
||||||
The docker build option is currently limited to *intel GPU* targets.
|
|
||||||
|
The docker build option is currently limited to *Intel GPU* targets.
|
||||||
|
|
||||||
### Build image
|
### Build image
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
# Using FP16
|
# Using FP16
|
||||||
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
|
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
|
||||||
@ -148,9 +149,10 @@ docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f
|
|||||||
|
|
||||||
*Notes*:
|
*Notes*:
|
||||||
|
|
||||||
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
|
To build in default FP32 *(Slower than FP16 alternative)*, set `--build-arg="GGML_SYCL_F16=OFF"` in the previous command.
|
||||||
|
|
||||||
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
|
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
|
||||||
|
Check the [documentation for Docker](../docker.md) to see the available images.
|
||||||
|
|
||||||
### Run container
|
### Run container
|
||||||
|
|
||||||
@ -250,7 +252,7 @@ sycl-ls
|
|||||||
|
|
||||||
- **Intel GPU**
|
- **Intel GPU**
|
||||||
|
|
||||||
When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`level_zero:gpu`] in the sample output below:
|
When targeting an Intel GPU, the user should expect one or more devices among the available SYCL devices. Please make sure that at least one GPU is present via `sycl-ls`, for instance `[level_zero:gpu]` in the sample output below:
|
||||||
|
|
||||||
```
|
```
|
||||||
[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||||
@ -282,7 +284,7 @@ For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
|
|||||||
|
|
||||||
#### Intel GPU
|
#### Intel GPU
|
||||||
|
|
||||||
```
|
```sh
|
||||||
./examples/sycl/build.sh
|
./examples/sycl/build.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -351,7 +353,7 @@ cmake --build build --config Release -j -v
|
|||||||
|
|
||||||
#### Retrieve and prepare model
|
#### Retrieve and prepare model
|
||||||
|
|
||||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
|
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).
|
||||||
|
|
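For example, the quantized model can be fetched directly into a local `models/` directory (the `resolve/` URL is the direct-download form of the link above):

```sh
wget -P models https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf
```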
||||||
##### Check device
|
##### Check device
|
||||||
|
|
||||||
@ -398,11 +400,15 @@ Choose one of following methods to run.
|
|||||||
|
|
||||||
```sh
|
```sh
|
||||||
./examples/sycl/run-llama2.sh 0
|
./examples/sycl/run-llama2.sh 0
|
||||||
|
# OR
|
||||||
|
./examples/sycl/run-llama3.sh 0
|
||||||
```
|
```
|
||||||
- Use multiple devices:
|
- Use multiple devices:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
./examples/sycl/run-llama2.sh
|
./examples/sycl/run-llama2.sh
|
||||||
|
# OR
|
||||||
|
./examples/sycl/run-llama3.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Command line
|
2. Command line
|
||||||
@ -425,13 +431,13 @@ Examples:
|
|||||||
- Use device 0:
|
- Use device 0:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm none -mg 0
|
||||||
```
|
```
|
||||||
|
|
||||||
- Use multiple devices:
|
- Use multiple devices:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm layer
|
||||||
```
|
```
|
||||||
|
|
||||||
*Notes:*
|
*Notes:*
|
||||||
@ -452,7 +458,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
|||||||
|
|
||||||
1. Install GPU driver
|
1. Install GPU driver
|
||||||
|
|
||||||
Intel GPU drivers instructions guide and download page can be found here: [Get intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
|
The Intel GPU driver installation guide and download page can be found here: [Get Intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
|
||||||
|
|
||||||
2. Install Visual Studio
|
2. Install Visual Studio
|
||||||
|
|
||||||
@ -629,7 +635,7 @@ Once it is completed, final results will be in **build/Release/bin**
|
|||||||
|
|
||||||
#### Retrieve and prepare model
|
#### Retrieve and prepare model
|
||||||
|
|
||||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
|
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).
|
||||||
|
|
||||||
##### Check device
|
##### Check device
|
||||||
|
|
||||||
@ -648,7 +654,7 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
|
|||||||
build\bin\llama-ls-sycl-device.exe
|
build\bin\llama-ls-sycl-device.exe
|
||||||
```
|
```
|
||||||
|
|
||||||
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
|
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *Intel GPUs* it would look like the following:
|
||||||
```
|
```
|
||||||
found 2 SYCL devices:
|
found 2 SYCL devices:
|
||||||
| | | |Compute |Max compute|Max work|Max sub| |
|
| | | |Compute |Max compute|Max work|Max sub| |
|
||||||
@ -658,13 +664,14 @@ found 2 SYCL devices:
|
|||||||
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
|
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Choose level-zero devices
|
#### Choose level-zero devices
|
||||||
|
|
||||||
|Chosen Device ID|Setting|
|
|Chosen Device ID|Setting|
|
||||||
|-|-|
|
|-|-|
|
||||||
|0|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action|
|
|0|Default option. You may also want to `set ONEAPI_DEVICE_SELECTOR="level_zero:0"`|
|
||||||
|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|
|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|
||||||
|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
|
|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"` or `set ONEAPI_DEVICE_SELECTOR="level_zero:*"`|
|
||||||
|
|
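For example, to pin the run to the second level-zero device before launching (a sketch; the model path matches the examples below):

```
set ONEAPI_DEVICE_SELECTOR="level_zero:1"
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Hello" -n 32 -ngl 99
```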
||||||
#### Execute
|
#### Execute
|
||||||
|
|
||||||
@ -673,7 +680,13 @@ Choose one of following methods to run.
|
|||||||
1. Script
|
1. Script
|
||||||
|
|
||||||
```
|
```
|
||||||
examples\sycl\win-run-llama2.bat
|
examples\sycl\win-run-llama-2.bat
|
||||||
|
```
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
```
|
||||||
|
examples\sycl\win-run-llama-3.bat
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Command line
|
2. Command line
|
||||||
@ -697,13 +710,13 @@ Examples:
|
|||||||
- Use device 0:
|
- Use device 0:
|
||||||
|
|
||||||
```
|
```
|
||||||
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm none -mg 0
|
||||||
```
|
```
|
||||||
|
|
||||||
- Use multiple devices:
|
- Use multiple devices:
|
||||||
|
|
||||||
```
|
```
|
||||||
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm layer
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
@ -714,7 +727,9 @@ Note:
|
|||||||
```sh
|
```sh
|
||||||
detect 1 SYCL GPUs: [0] with top Max compute units:512
|
detect 1 SYCL GPUs: [0] with top Max compute units:512
|
||||||
```
|
```
|
||||||
|
|
||||||
Or
|
Or
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
use 1 SYCL GPUs: [0] with Max compute units:512
|
use 1 SYCL GPUs: [0] with Max compute units:512
|
||||||
```
|
```
|
||||||
@ -726,14 +741,17 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
|||||||
|
|
||||||
| Name | Value | Function |
|
| Name | Value | Function |
|
||||||
|--------------------|---------------------------------------|---------------------------------------------|
|
|--------------------|---------------------------------------|---------------------------------------------|
|
||||||
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. |
|
||||||
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
|
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
|
||||||
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
|
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
|
||||||
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
|
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. (1.) |
|
||||||
| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
|
| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
|
||||||
|
| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
|
||||||
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
|
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
|
||||||
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|
||||||
|
|
||||||
|
1. FP16 is recommended for better prompt-processing performance on quantized models. Text-generation performance is equivalent, but set `GGML_SYCL_F16=OFF` if you experience issues with FP16 builds.
|
||||||
|
|
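Putting a few of the options above together, a typical Linux configuration looks like this (a sketch; the AMD architecture value is only an illustration):

```sh
source /opt/intel/oneapi/setvars.sh

# Intel target with FP16 enabled
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_F16=ON \
      -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build build --config Release -j -v

# AMD target, where the device architecture must be set explicitly
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=gfx90a \
      -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
```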
||||||
#### Runtime
|
#### Runtime
|
||||||
|
|
||||||
| Name | Value | Function |
|
| Name | Value | Function |
|
||||||
@ -741,6 +759,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
|||||||
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
|
||||||
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
|
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
|
||||||
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
|
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
|
||||||
|
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
|
||||||
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
|
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
|
||||||
|
|
||||||
|
|
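These are plain environment variables, so no rebuild is needed. For example (paths are placeholders):

```sh
export GGML_SYCL_DEBUG=1     # verbose SYCL debug logging
export ZES_ENABLE_SYSMAN=1   # report free GPU memory, useful with --split-mode layer
./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Hello" -n 32 -ngl 99 -sm layer
```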
||||||
@ -750,7 +769,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
|||||||
|
|
||||||
## Q&A
|
## Q&A
|
||||||
|
|
||||||
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
|
- Error: `error while loading shared libraries: libsycl.so: cannot open shared object file: No such file or directory`.
|
||||||
|
|
||||||
- Potential cause: Unavailable oneAPI installation or not set ENV variables.
|
- Potential cause: Unavailable oneAPI installation or not set ENV variables.
|
||||||
- Solution: Install *oneAPI base toolkit* and enable its ENV through: `source /opt/intel/oneapi/setvars.sh`.
|
- Solution: Install *oneAPI base toolkit* and enable its ENV through: `source /opt/intel/oneapi/setvars.sh`.
|
||||||
@ -779,18 +798,18 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
|||||||
|
|
||||||
It's the same for other projects, including the llama.cpp SYCL backend.
|
||||||
|
|
||||||
- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
|
- `Native API failed. Native API returns: 39 (UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY)`, `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 3503030272 Bytes of memory on device`, or `failed to allocate SYCL0 buffer`
|
||||||
|
|
||||||
Device Memory is not enough.
|
You are running out of Device Memory.
|
||||||
|
|
||||||
|Reason|Solution|
|
|Reason|Solution|
|
||||||
|-|-|
|
|-|-|
|
||||||
|Default Context is too big. It leads to more memory usage.|Set `-c 8192` or smaller value.|
|
| The default context is too big. It leads to excessive memory usage.|Set `-c 8192` or a smaller value.|
|
||||||
|Model is big and require more memory than device's.|Choose smaller quantized model, like Q5 -> Q4;<br>Use more than one devices to load model.|
|
| The model is too big and requires more memory than what is available.|Choose a smaller model or change to a smaller quantization, like Q5 -> Q4;<br>Alternatively, use more than one device to load model.|
|
||||||
|
|
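In practice, the two mitigations from the table look like this (paths are placeholders):

```sh
# 1) shrink the context window
./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -c 8192 -ngl 99 -p "Hello"

# 2) split the model across all available SYCL devices
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -ngl 99 -sm layer -p "Hello"
```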
||||||
### **GitHub contribution**:
|
### **GitHub contribution**:
|
||||||
Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
|
Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.
|
||||||
|
|
||||||
## TODO
|
## TODO
|
||||||
|
|
||||||
- NA
|
- Review ZES_ENABLE_SYSMAN: https://github.com/intel/compute-runtime/blob/master/programmers-guide/SYSMAN.md#support-and-limitations
|
||||||
|
@ -1,5 +1,9 @@
|
|||||||
# Build llama.cpp locally
|
# Build llama.cpp locally
|
||||||
|
|
||||||
|
The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
|
||||||
|
|
||||||
|
The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.
|
||||||
|
|
||||||
**To get the Code:**
|
**To get the Code:**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@ -63,6 +67,7 @@ cmake --build build --config Release
|
|||||||
cmake --preset x64-windows-llvm-release
|
cmake --preset x64-windows-llvm-release
|
||||||
cmake --build build-x64-windows-llvm-release
|
cmake --build build-x64-windows-llvm-release
|
||||||
```
|
```
|
||||||
|
- Curl usage is enabled by default and can be turned off with `-DLLAMA_CURL=OFF` (see the example below); otherwise you need to install the libcurl development libraries.
|
||||||
|
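For example:

```sh
# build without libcurl support, so no libcurl development package is required
cmake -B build -DLLAMA_CURL=OFF
cmake --build build --config Release
```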
|
||||||
## BLAS Build
|
## BLAS Build
|
||||||
|
|
||||||
|
@ -9,10 +9,10 @@ Adding a model requires few steps:
|
|||||||
After following these steps, you can open a PR.
|
||||||
|
|
||||||
Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
|
Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
|
||||||
- [main](/examples/main/)
|
- [main](/tools/main/)
|
||||||
- [imatrix](/examples/imatrix/)
|
- [imatrix](/tools/imatrix/)
|
||||||
- [quantize](/examples/quantize/)
|
- [quantize](/tools/quantize/)
|
||||||
- [server](/examples/server/)
|
- [server](/tools/server/)
|
||||||
|
|
||||||
### 1. Convert the model to GGUF
|
### 1. Convert the model to GGUF
|
||||||
|
|
||||||
|
@ -22,6 +22,9 @@ Additionally, there the following images, similar to the above:
|
|||||||
- `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
|
- `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||||
- `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
|
- `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||||
- `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
|
- `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||||
|
- `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`)
|
||||||
|
- `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`)
|
||||||
|
- `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`)
|
||||||
|
|
||||||
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library), you'll need to build the images locally for now.
|
||||||
|
|
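For instance, a hypothetical run of the SYCL-enabled `light-intel` image on an Intel GPU host (the model path and device flags depend on your setup):

```sh
docker run --device /dev/dri -v /path/to/models:/models \
  ghcr.io/ggml-org/llama.cpp:light-intel \
  -m /models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 64 -ngl 99
```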
||||||
@ -104,7 +107,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment
|
|||||||
|
|
||||||
The defaults are:
|
The defaults are:
|
||||||
|
|
||||||
- `MUSA_VERSION` set to `rc3.1.1`
|
- `MUSA_VERSION` set to `rc4.0.1`
|
||||||
|
|
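For example, the default can be overridden at build time (assuming the MUSA Dockerfile lives at `.devops/musa.Dockerfile`):

```sh
docker build -t local/llama.cpp:light-musa \
  --build-arg MUSA_VERSION=rc4.0.1 \
  --target light -f .devops/musa.Dockerfile .
```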
||||||
The resulting images are essentially the same as the non-MUSA images:
|
||||||
|
|
||||||
|
@ -2,7 +2,6 @@
|
|||||||
|
|
||||||
[chat.h](../common/chat.h) (https://github.com/ggml-org/llama.cpp/pull/9639) adds support for [OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) and is used in:
|
[chat.h](../common/chat.h) (https://github.com/ggml-org/llama.cpp/pull/9639) adds support for [OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) and is used in:
|
||||||
- `llama-server` when started w/ `--jinja` flag
|
- `llama-server` when started w/ `--jinja` flag
|
||||||
- `llama-cli` (WIP: https://github.com/ggml-org/llama.cpp/pull/11556)
|
|
||||||
|
|
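For example, a minimal way to start the server with tool calling enabled (the model path is a placeholder):

```sh
llama-server --jinja -m model.gguf --port 8080
```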
||||||
## Universal support w/ Native & Generic handlers
|
## Universal support w/ Native & Generic handlers
|
||||||
|
|
||||||
@ -325,36 +324,65 @@ To get the official template from original HuggingFace repos, you can use [scrip
|
|||||||
> [!TIP]
|
> [!TIP]
|
||||||
> If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills)
|
> If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills)
|
||||||
|
|
||||||
|
> [!CAUTION]
|
||||||
|
> Beware of extreme KV quantizations (e.g. `-ctk q4_0`), they can substantially degrade the model's tool calling performance.
|
||||||
|
|
||||||
Test in CLI (or with any library / software that can use OpenAI-compatible API backends):
|
Test in CLI (or with any library / software that can use OpenAI-compatible API backends):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl http://localhost:8080/v1/chat/completions -d '{
|
curl http://localhost:8080/v1/chat/completions -d '{
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
"tools": [
|
"tools": [
|
||||||
{
|
{
|
||||||
"type":"function",
|
"type":"function",
|
||||||
"function":{
|
"function":{
|
||||||
"name":"python",
|
"name":"python",
|
||||||
"description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
|
"description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
|
||||||
"parameters":{
|
"parameters":{
|
||||||
"type":"object",
|
"type":"object",
|
||||||
"properties":{
|
"properties":{
|
||||||
"code":{
|
"code":{
|
||||||
"type":"string",
|
"type":"string",
|
||||||
"description":"The code to run in the ipython interpreter."
|
"description":"The code to run in the ipython interpreter."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required":["code"]
|
||||||
}
|
}
|
||||||
},
|
|
||||||
"required":["code"]
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
],
|
||||||
],
|
"messages": [
|
||||||
"messages": [
|
{
|
||||||
{
|
"role": "user",
|
||||||
"role": "user",
|
"content": "Print a hello world message with python."
|
||||||
"content": "Print a hello world message with python."
|
}
|
||||||
}
|
]
|
||||||
]
|
}'
|
||||||
|
|
||||||
|
|
||||||
|
curl http://localhost:8080/v1/chat/completions -d '{
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"messages": [
|
||||||
|
{"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things."},
|
||||||
|
{"role": "user", "content": "What is the weather in Istanbul?"}
|
||||||
|
],
|
||||||
|
"tools": [{
|
||||||
|
"type":"function",
|
||||||
|
"function":{
|
||||||
|
"name":"get_current_weather",
|
||||||
|
"description":"Get the current weather in a given location",
|
||||||
|
"parameters":{
|
||||||
|
"type":"object",
|
||||||
|
"properties":{
|
||||||
|
"location":{
|
||||||
|
"type":"string",
|
||||||
|
"description":"The city and country/state, e.g. `San Francisco, CA`, or `Paris, France`"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required":["location"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}]
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -1,28 +1,42 @@
|
|||||||
# Install pre-built version of llama.cpp
|
# Install pre-built version of llama.cpp
|
||||||
|
|
||||||
## Homebrew
|
| Install via | Windows | Mac | Linux |
|
||||||
|
|-------------|---------|-----|-------|
|
||||||
|
| Winget | ✅ | | |
|
||||||
|
| Homebrew | | ✅ | ✅ |
|
||||||
|
| MacPorts | | ✅ | |
|
||||||
|
| Nix | | ✅ | ✅ |
|
||||||
|
|
||||||
On Mac and Linux, the homebrew package manager can be used via
|
## Winget (Windows)
|
||||||
|
|
||||||
|
```sh
|
||||||
|
winget install llama.cpp
|
||||||
|
```
|
||||||
|
|
||||||
|
The package is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/issues/8188
|
||||||
|
|
||||||
|
## Homebrew (Mac and Linux)
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
brew install llama.cpp
|
brew install llama.cpp
|
||||||
```
|
```
|
||||||
|
|
||||||
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
|
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
|
||||||
|
|
||||||
## MacPorts
|
## MacPorts (Mac)
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
sudo port install llama.cpp
|
sudo port install llama.cpp
|
||||||
```
|
```
|
||||||
see also: https://ports.macports.org/port/llama.cpp/details/
|
|
||||||
|
|
||||||
## Nix
|
See also: https://ports.macports.org/port/llama.cpp/details/
|
||||||
|
|
||||||
On Mac and Linux, the Nix package manager can be used via
|
## Nix (Mac and Linux)
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
nix profile install nixpkgs#llama-cpp
|
nix profile install nixpkgs#llama-cpp
|
||||||
```
|
```
|
||||||
|
|
||||||
For flake enabled installs.
|
For flake enabled installs.
|
||||||
|
|
||||||
Or
|
Or
|
||||||
@ -34,13 +48,3 @@ nix-env --file '<nixpkgs>' --install --attr llama-cpp
|
|||||||
For non-flake enabled installs.
|
For non-flake enabled installs.
|
||||||
|
|
||||||
This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
|
This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
|
||||||
|
|
||||||
## Flox
|
|
||||||
|
|
||||||
On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
|
|
||||||
|
|
||||||
```sh
|
|
||||||
flox install llama-cpp
|
|
||||||
```
|
|
||||||
|
|
||||||
Flox follows the nixpkgs build of llama.cpp.
|
|
||||||
|
109
docs/multimodal.md
Normal file
109
docs/multimodal.md
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
# Multimodal
|
||||||
|
|
||||||
|
llama.cpp supports multimodal input via `libmtmd`. Currently, 2 tools support this feature:
|
||||||
|
- [llama-mtmd-cli](../tools/mtmd/README.md)
|
||||||
|
- [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
|
||||||
|
|
||||||
|
Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality.
|
||||||
|
|
||||||
|
To enable it, you can use one of the 2 methods below:
|
||||||
|
|
||||||
|
- Use `-hf` option with a supported model (see a list of pre-quantized model below)
|
||||||
|
- To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
|
||||||
|
- To load a model using `-hf` while using a custom mmproj file, use `--mmproj local_file.gguf`
|
||||||
|
- Use `-m model.gguf` option with `--mmproj file.gguf` to specify text and multimodal projector respectively
|
||||||
|
|
||||||
|
By default, the multimodal projector will be offloaded to the GPU. To disable this, add `--no-mmproj-offload`
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# simple usage with CLI
|
||||||
|
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
|
||||||
|
|
||||||
|
# simple usage with server
|
||||||
|
llama-server -hf ggml-org/gemma-3-4b-it-GGUF
|
||||||
|
|
||||||
|
# using local file
|
||||||
|
llama-server -m gemma-3-4b-it-Q4_K_M.gguf --mmproj mmproj-gemma-3-4b-it-Q4_K_M.gguf
|
||||||
|
|
||||||
|
# no GPU offload
|
||||||
|
llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
|
||||||
|
```
|
||||||
|
|
||||||
|
## Pre-quantized models
|
||||||
|
|
||||||
|
These are ready-to-use models; most of them come with `Q4_K_M` quantization by default. They can be found on the ggml-org Hugging Face page: https://huggingface.co/collections/ggml-org/multimodal-ggufs-68244e01ff1f39e5bebeeedc
|
||||||
|
|
||||||
|
Replace `(tool_name)` with the name of the binary you want to use, for example `llama-mtmd-cli` or `llama-server`.
|
||||||
|
|
||||||
|
NOTE: some models may require a large context window, for example `-c 8192`
|
||||||
|
|
||||||
|
**Vision models**:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# Gemma 3
|
||||||
|
(tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
|
||||||
|
(tool_name) -hf ggml-org/gemma-3-12b-it-GGUF
|
||||||
|
(tool_name) -hf ggml-org/gemma-3-27b-it-GGUF
|
||||||
|
|
||||||
|
# SmolVLM
|
||||||
|
(tool_name) -hf ggml-org/SmolVLM-Instruct-GGUF
|
||||||
|
(tool_name) -hf ggml-org/SmolVLM-256M-Instruct-GGUF
|
||||||
|
(tool_name) -hf ggml-org/SmolVLM-500M-Instruct-GGUF
|
||||||
|
(tool_name) -hf ggml-org/SmolVLM2-2.2B-Instruct-GGUF
|
||||||
|
(tool_name) -hf ggml-org/SmolVLM2-256M-Video-Instruct-GGUF
|
||||||
|
(tool_name) -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
|
||||||
|
|
||||||
|
# Pixtral 12B
|
||||||
|
(tool_name) -hf ggml-org/pixtral-12b-GGUF
|
||||||
|
|
||||||
|
# Qwen 2 VL
|
||||||
|
(tool_name) -hf ggml-org/Qwen2-VL-2B-Instruct-GGUF
|
||||||
|
(tool_name) -hf ggml-org/Qwen2-VL-7B-Instruct-GGUF
|
||||||
|
|
||||||
|
# Qwen 2.5 VL
|
||||||
|
(tool_name) -hf ggml-org/Qwen2.5-VL-3B-Instruct-GGUF
|
||||||
|
(tool_name) -hf ggml-org/Qwen2.5-VL-7B-Instruct-GGUF
|
||||||
|
(tool_name) -hf ggml-org/Qwen2.5-VL-32B-Instruct-GGUF
|
||||||
|
(tool_name) -hf ggml-org/Qwen2.5-VL-72B-Instruct-GGUF
|
||||||
|
|
||||||
|
# Mistral Small 3.1 24B (IQ2_M quantization)
|
||||||
|
(tool_name) -hf ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF
|
||||||
|
|
||||||
|
# InternVL 2.5 and 3
|
||||||
|
(tool_name) -hf ggml-org/InternVL2_5-1B-GGUF
|
||||||
|
(tool_name) -hf ggml-org/InternVL2_5-4B-GGUF
|
||||||
|
(tool_name) -hf ggml-org/InternVL3-1B-Instruct-GGUF
|
||||||
|
(tool_name) -hf ggml-org/InternVL3-2B-Instruct-GGUF
|
||||||
|
(tool_name) -hf ggml-org/InternVL3-8B-Instruct-GGUF
|
||||||
|
(tool_name) -hf ggml-org/InternVL3-14B-Instruct-GGUF
|
||||||
|
|
||||||
|
# Llama 4 Scout
|
||||||
|
(tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
|
||||||
|
|
||||||
|
# Moondream2 20250414 version
|
||||||
|
(tool_name) -hf ggml-org/moondream2-20250414-GGUF
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
**Audio models**:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# Ultravox 0.5
|
||||||
|
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF
|
||||||
|
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF
|
||||||
|
|
||||||
|
# Qwen2-Audio and SeaLLM-Audio
|
||||||
|
# note: no pre-quantized GGUF for this model, as the results are very poor
|
||||||
|
# ref: https://github.com/ggml-org/llama.cpp/pull/13760
|
||||||
|
```
|
||||||
|
|
||||||
|
**Mixed modalities**:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# Qwen2.5 Omni
|
||||||
|
# Capabilities: audio input, vision input
|
||||||
|
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
|
||||||
|
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
|
||||||
|
```
|
@ -33,13 +33,13 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
|||||||
2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
|
python ./tools/mtmd/llava_surgery.py -m path/to/MobileVLM-1.7B
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
|
3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
python ./examples/llava/convert_image_encoder_to_gguf.py \
|
python ./tools/mtmd/convert_image_encoder_to_gguf.py \
|
||||||
-m path/to/clip-vit-large-patch14-336 \
|
-m path/to/clip-vit-large-patch14-336 \
|
||||||
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
|
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
|
||||||
--output-dir path/to/MobileVLM-1.7B \
|
--output-dir path/to/MobileVLM-1.7B \
|
||||||
@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf.py \
|
|||||||
```
|
```
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
python ./examples/llava/convert_image_encoder_to_gguf.py \
|
python ./tools/mtmd/convert_image_encoder_to_gguf.py \
|
||||||
-m path/to/clip-vit-large-patch14-336 \
|
-m path/to/clip-vit-large-patch14-336 \
|
||||||
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
|
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
|
||||||
--output-dir path/to/MobileVLM-1.7B_V2 \
|
--output-dir path/to/MobileVLM-1.7B_V2 \
|
||||||
@ -69,10 +69,10 @@ Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directo
|
|||||||
|
|
||||||
## Android compile and run
|
## Android compile and run
|
||||||
### compile
|
### compile
|
||||||
refer to `examples/llava/android/build_64.sh`
|
refer to `tools/mtmd/android/build_64.sh`
|
||||||
```sh
|
```sh
|
||||||
mkdir examples/llava/android/build_64
|
mkdir tools/mtmd/android/build_64
|
||||||
cd examples/llava/android/build_64
|
cd tools/mtmd/android/build_64
|
||||||
../build_64.sh
|
../build_64.sh
|
||||||
```
|
```
|
||||||
### run on Android
|
### run on Android
|
||||||
|
@ -25,13 +25,13 @@ git clone https://huggingface.co/THUDM/glm-edge-v-5b or https://huggingface.co/T
|
|||||||
2. Use `glmedge-surgery.py` to split the GLMV-EDGE model to LLM and multimodel projector constituents:
|
2. Use `glmedge-surgery.py` to split the GLMV-EDGE model to LLM and multimodel projector constituents:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
python ./examples/llava/glmedge-surgery.py -m ../model_path
|
python ./tools/mtmd/glmedge-surgery.py -m ../model_path
|
||||||
```
|
```
|
||||||
|
|
||||||
4. Use `glmedge-convert-image-encoder-to-gguf.py` to convert the GLMV-EDGE image encoder to GGUF:
|
4. Use `glmedge-convert-image-encoder-to-gguf.py` to convert the GLMV-EDGE image encoder to GGUF:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
python ./examples/llava/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
|
python ./tools/mtmd/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
|
||||||
```
|
```
|
||||||
|
|
||||||
5. Use `examples/convert_hf_to_gguf.py` to convert the LLM part of GLMV-EDGE to GGUF:
|
5. Use `examples/convert_hf_to_gguf.py` to convert the LLM part of GLMV-EDGE to GGUF:
|
||||||
|
@ -37,19 +37,19 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
|||||||
2. Install the required Python packages:
|
2. Install the required Python packages:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
pip install -r examples/llava/requirements.txt
|
pip install -r tools/mtmd/requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
|
python ./tools/mtmd/llava_surgery.py -m ../llava-v1.5-7b
|
||||||
```
|
```
|
||||||
|
|
||||||
4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
|
4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
|
python ./tools/mtmd/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
|
||||||
```
|
```
|
||||||
|
|
||||||
5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
|
5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||||
@ -69,12 +69,12 @@ git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
|
|||||||
2) Install the required Python packages:
|
2) Install the required Python packages:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
pip install -r examples/llava/requirements.txt
|
pip install -r tools/mtmd/requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
|
3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
|
||||||
```console
|
```console
|
||||||
python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
|
python tools/mtmd/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
|
||||||
```
|
```
|
||||||
- you will find a llava.projector and a llava.clip file in your model directory
|
- you will find a llava.projector and a llava.clip file in your model directory
|
||||||
|
|
||||||
@ -88,7 +88,7 @@ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.jso
|
|||||||
|
|
||||||
5) Create the visual gguf model:
|
5) Create the visual gguf model:
|
||||||
```console
|
```console
|
||||||
python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
|
python ./tools/mtmd/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
|
||||||
```
|
```
|
||||||
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
|
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
|
||||||
|
|
||||||
|
@ -29,8 +29,8 @@ cmake --build build --config Release
|
|||||||
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)
|
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
|
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-o-2_6
|
||||||
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
|
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
|
||||||
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
|
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
|
||||||
|
|
||||||
# quantize int4 version
|
# quantize int4 version
|
||||||
|
@ -28,8 +28,8 @@ cmake --build build --config Release
|
|||||||
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
|
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
|
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
|
||||||
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
|
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
|
||||||
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
|
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
|
||||||
|
|
||||||
# quantize int4 version
|
# quantize int4 version
|
||||||
|
@ -28,8 +28,8 @@ cmake --build build --config Release
|
|||||||
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
|
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
|
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-V-2_6
|
||||||
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
|
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
|
||||||
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
|
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
|
||||||
|
|
||||||
# quantize int4 version
|
# quantize int4 version
|
||||||
|
@ -12,51 +12,30 @@ llama_add_compile_flags()
|
|||||||
|
|
||||||
# examples
|
# examples
|
||||||
|
|
||||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
|
||||||
|
|
||||||
if (EMSCRIPTEN)
|
if (EMSCRIPTEN)
|
||||||
else()
|
else()
|
||||||
add_subdirectory(batched-bench)
|
|
||||||
add_subdirectory(batched)
|
add_subdirectory(batched)
|
||||||
add_subdirectory(embedding)
|
add_subdirectory(embedding)
|
||||||
add_subdirectory(eval-callback)
|
add_subdirectory(eval-callback)
|
||||||
|
|
||||||
add_subdirectory(gguf-hash)
|
add_subdirectory(gguf-hash)
|
||||||
add_subdirectory(gguf-split)
|
|
||||||
add_subdirectory(gguf)
|
add_subdirectory(gguf)
|
||||||
add_subdirectory(gritlm)
|
add_subdirectory(gritlm)
|
||||||
add_subdirectory(imatrix)
|
|
||||||
add_subdirectory(infill)
|
|
||||||
add_subdirectory(llama-bench)
|
|
||||||
add_subdirectory(lookahead)
|
add_subdirectory(lookahead)
|
||||||
add_subdirectory(lookup)
|
add_subdirectory(lookup)
|
||||||
add_subdirectory(main)
|
|
||||||
add_subdirectory(parallel)
|
add_subdirectory(parallel)
|
||||||
add_subdirectory(passkey)
|
add_subdirectory(passkey)
|
||||||
add_subdirectory(perplexity)
|
|
||||||
add_subdirectory(quantize)
|
|
||||||
add_subdirectory(retrieval)
|
add_subdirectory(retrieval)
|
||||||
if (LLAMA_BUILD_SERVER)
|
|
||||||
add_subdirectory(server)
|
|
||||||
endif()
|
|
||||||
add_subdirectory(save-load-state)
|
add_subdirectory(save-load-state)
|
||||||
add_subdirectory(run)
|
|
||||||
add_subdirectory(simple)
|
add_subdirectory(simple)
|
||||||
add_subdirectory(simple-chat)
|
add_subdirectory(simple-chat)
|
||||||
add_subdirectory(speculative)
|
add_subdirectory(speculative)
|
||||||
add_subdirectory(speculative-simple)
|
add_subdirectory(speculative-simple)
|
||||||
add_subdirectory(tokenize)
|
|
||||||
add_subdirectory(tts)
|
|
||||||
add_subdirectory(gen-docs)
|
add_subdirectory(gen-docs)
|
||||||
|
add_subdirectory(training)
|
||||||
if (NOT GGML_BACKEND_DL)
|
if (NOT GGML_BACKEND_DL)
|
||||||
# these examples use the backends directly and cannot be built with dynamic loading
|
|
||||||
add_subdirectory(convert-llama2c-to-ggml)
|
add_subdirectory(convert-llama2c-to-ggml)
|
||||||
add_subdirectory(cvector-generator)
|
# these examples use the backends directly and cannot be built with dynamic loading
|
||||||
add_subdirectory(export-lora)
|
|
||||||
add_subdirectory(llava)
|
|
||||||
if (GGML_RPC)
|
|
||||||
add_subdirectory(rpc)
|
|
||||||
endif()
|
|
||||||
if (GGML_SYCL)
|
if (GGML_SYCL)
|
||||||
add_subdirectory(sycl)
|
add_subdirectory(sycl)
|
||||||
endif()
|
endif()
|
||||||
|
@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for i in 1 ..< n_parallel {
|
for i in 1 ..< n_parallel {
|
||||||
llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
|
llama_memory_seq_cp(llama_get_memory(context), 0, Int32(i), 0, batch.n_tokens)
|
||||||
}
|
}
|
||||||
|
|
||||||
if n_parallel > 1 {
|
if n_parallel > 1 {
|
||||||
|
@ -35,23 +35,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
|
|||||||
|
|
||||||
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
|
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
|
||||||
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||||
const struct llama_model * model = llama_get_model(ctx);
|
|
||||||
|
|
||||||
// clear previous kv_cache values (irrelevant for embeddings)
|
// clear previous kv_cache values (irrelevant for embeddings)
|
||||||
llama_kv_self_clear(ctx);
|
llama_memory_clear(llama_get_memory(ctx), true);
|
||||||
|
|
||||||
// run model
|
// run model
|
||||||
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||||
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
|
if (llama_decode(ctx, batch) < 0) {
|
||||||
// encoder-only model
|
LOG_ERR("%s : failed to process\n", __func__);
|
||||||
if (llama_encode(ctx, batch) < 0) {
|
|
||||||
LOG_ERR("%s : failed to encode\n", __func__);
|
|
||||||
}
|
|
||||||
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
|
|
||||||
// decoder-only model
|
|
||||||
if (llama_decode(ctx, batch) < 0) {
|
|
||||||
LOG_ERR("%s : failed to decode\n", __func__);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < batch.n_tokens; i++) {
|
for (int i = 0; i < batch.n_tokens; i++) {
|
||||||
@ -245,9 +236,24 @@ int main(int argc, char ** argv) {
|
|||||||
LOG("\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
} else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
|
} else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
|
||||||
|
const uint32_t n_cls_out = llama_model_n_cls_out(model);
|
||||||
|
std::vector<std::string> cls_out_labels;
|
||||||
|
|
||||||
|
for (uint32_t i = 0; i < n_cls_out; i++) {
|
||||||
|
const char * label = llama_model_cls_label(model, i);
|
||||||
|
const std::string label_i(label == nullptr ? "" : label);
|
||||||
|
cls_out_labels.emplace_back(label_i.empty() ? std::to_string(i) : label_i);
|
||||||
|
}
|
||||||
|
|
||||||
for (int j = 0; j < n_embd_count; j++) {
|
for (int j = 0; j < n_embd_count; j++) {
|
||||||
// NOTE: if you change this log - update the tests in ci/run.sh
|
for (uint32_t i = 0; i < n_cls_out; i++) {
|
||||||
LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
|
// NOTE: if you change this log - update the tests in ci/run.sh
|
||||||
|
if (n_cls_out == 1) {
|
||||||
|
LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
|
||||||
|
} else {
|
||||||
|
LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// print the first part of the embeddings or for a single prompt, the full embedding
|
// print the first part of the embeddings or for a single prompt, the full embedding
|
||||||
|
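The new rerank branch queries the number of classifier heads and their labels directly from the model via `llama_model_n_cls_out` and `llama_model_cls_label`, both visible in the hunk above. A small stand-alone sketch of that lookup (a hypothetical helper; it assumes `emb` holds `n_cls_out` scores per sequence, laid out as in the example):

```cpp
#include "llama.h"

#include <cstdio>
#include <string>

static void print_rerank_scores(const llama_model * model, const float * emb, int n_seq, int n_embd) {
    const uint32_t n_cls_out = llama_model_n_cls_out(model);

    for (int j = 0; j < n_seq; j++) {
        for (uint32_t i = 0; i < n_cls_out; i++) {
            const char * label = llama_model_cls_label(model, i); // may be null for unnamed heads
            const std::string name = (label == nullptr || *label == '\0') ? std::to_string(i) : label;
            printf("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], name.c_str());
        }
    }
}
```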
@ -45,7 +45,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
|||||||
}
|
}
|
||||||
|
|
||||||
// clear previous kv_cache values (irrelevant for embeddings)
|
// clear previous kv_cache values (irrelevant for embeddings)
|
||||||
llama_kv_self_clear(ctx);
|
llama_memory_clear(llama_get_memory(ctx), true);
|
||||||
llama_set_embeddings(ctx, true);
|
llama_set_embeddings(ctx, true);
|
||||||
llama_set_causal_attn(ctx, false);
|
llama_set_causal_attn(ctx, false);
|
||||||
|
|
||||||
@ -102,7 +102,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
|
|||||||
|
|
||||||
llama_token eos_token = llama_vocab_eos(vocab);
|
llama_token eos_token = llama_vocab_eos(vocab);
|
||||||
|
|
||||||
llama_kv_self_clear(ctx);
|
llama_memory_clear(llama_get_memory(ctx), true);
|
||||||
llama_set_embeddings(ctx, false);
|
llama_set_embeddings(ctx, false);
|
||||||
llama_set_causal_attn(ctx, true);
|
llama_set_causal_attn(ctx, true);
|
||||||
|
|
||||||
|
@ -1,5 +0,0 @@
|
|||||||
set(TARGET llama-infill)
|
|
||||||
add_executable(${TARGET} infill.cpp)
|
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
|
@ -1,47 +0,0 @@
|
|||||||
# llama.cpp/example/infill
|
|
||||||
|
|
||||||
This example shows how to use infill mode with Code Llama models that support it.
|
|
||||||
Currently the 7B and 13B models support infill mode.
|
|
||||||
|
|
||||||
Infill supports most of the options available in the main example.
|
|
||||||
|
|
||||||
For further information, have a look at the main README.md in llama.cpp/example/main/README.md
|
|
||||||
|
|
||||||
## Common Options
|
|
||||||
|
|
||||||
In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models:
|
|
||||||
|
|
||||||
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
|
||||||
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
|
|
||||||
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
|
|
||||||
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
|
|
||||||
- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
|
|
||||||
|
|
||||||
## Input Prompts
|
|
||||||
|
|
||||||
The `infill` program provides several ways to interact with the LLaMA models using input prompts:
|
|
||||||
|
|
||||||
- `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option.
|
|
||||||
- `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option.
|
|
||||||
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
|
|
||||||
|
|
||||||
## Interaction
|
|
||||||
|
|
||||||
The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. The interactive mode can be triggered using `--interactive` or `--interactive-first`.
|
|
||||||
|
|
||||||
### Interaction Options
|
|
||||||
|
|
||||||
- `-i, --interactive`: Run the program in interactive mode, allowing users to get real-time code suggestions from the model.
|
|
||||||
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
|
|
||||||
- `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text.
|
|
||||||
|
|
||||||
### Example
|
|
||||||
|
|
||||||
Download a model that supports infill, for example CodeLlama:
|
|
||||||
```console
|
|
||||||
scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
|
|
||||||
```
|
|
||||||
|
|
||||||
```bash
|
|
||||||
./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
|
|
||||||
```
|
|
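The removed infill example source (`infill.cpp`) below assembles the fill-in-the-middle prompt from the prefix/suffix strings and the vocab's FIM special tokens, swapping prefix and suffix when `--spm-infill` is set. A condensed sketch of that prompt construction, using only the helpers that appear in the source (treat it as a summary of the removed code, not a current API reference):

```cpp
#include "common.h"
#include "llama.h"

#include <string>
#include <vector>

// layout: [BOS] <fim_pre> prefix <fim_suf> suffix <fim_mid>   (PSM)
//     or: [BOS] <fim_suf> suffix <fim_pre> prefix <fim_mid>   (SPM, --spm-infill)
static std::vector<llama_token> build_infill_prompt(
        llama_context * ctx, const llama_vocab * vocab,
        const std::string & prefix, const std::string & suffix,
        bool spm_infill, bool add_bos) {
    std::vector<llama_token> inp_pfx = common_tokenize(ctx, prefix, false);
    std::vector<llama_token> inp_sfx = common_tokenize(ctx, suffix, false);

    inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
    inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));

    std::vector<llama_token> inp = spm_infill ? inp_sfx : inp_pfx;
    std::vector<llama_token> end = spm_infill ? inp_pfx : inp_sfx;

    if (add_bos) {
        inp.insert(inp.begin(), llama_vocab_bos(vocab));
    }
    inp.insert(inp.end(), end.begin(), end.end());

    const llama_token fim_mid = llama_vocab_fim_mid(vocab);
    if (fim_mid >= 0) {
        inp.push_back(fim_mid); // generation continues after the middle token
    }
    return inp;
}
```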
@ -1,590 +0,0 @@
|
|||||||
#include "arg.h"
|
|
||||||
#include "common.h"
|
|
||||||
#include "console.h"
|
|
||||||
#include "sampling.h"
|
|
||||||
#include "log.h"
|
|
||||||
#include "llama.h"
|
|
||||||
|
|
||||||
#include <cassert>
|
|
||||||
#include <cinttypes>
|
|
||||||
#include <cmath>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstring>
|
|
||||||
#include <ctime>
|
|
||||||
#include <fstream>
|
|
||||||
#include <iostream>
|
|
||||||
#include <sstream>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
|
||||||
#include <signal.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#elif defined (_WIN32)
|
|
||||||
#define WIN32_LEAN_AND_MEAN
|
|
||||||
#ifndef NOMINMAX
|
|
||||||
#define NOMINMAX
|
|
||||||
#endif
|
|
||||||
#include <windows.h>
|
|
||||||
#include <signal.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static llama_context ** g_ctx;
|
|
||||||
static llama_model ** g_model;
|
|
||||||
static common_sampler ** g_smpl;
|
|
||||||
static common_params * g_params;
|
|
||||||
static std::vector<llama_token> * g_input_tokens;
|
|
||||||
static std::ostringstream * g_output_ss;
|
|
||||||
static std::vector<llama_token> * g_output_tokens;
|
|
||||||
|
|
||||||
static bool is_interacting = false;
|
|
||||||
|
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
|
||||||
static void sigint_handler(int signo) {
|
|
||||||
if (signo == SIGINT) {
|
|
||||||
if (!is_interacting) {
|
|
||||||
is_interacting = true;
|
|
||||||
} else {
|
|
||||||
console::cleanup();
|
|
||||||
LOG("\n");
|
|
||||||
common_perf_print(*g_ctx, *g_smpl);
|
|
||||||
|
|
||||||
// make sure all logs are flushed
|
|
||||||
LOG("Interrupted by user\n");
|
|
||||||
common_log_pause(common_log_main());
|
|
||||||
|
|
||||||
_exit(130);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
|
||||||
common_params params;
|
|
||||||
g_params = ¶ms;
|
|
||||||
|
|
||||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
common_init();
|
|
||||||
|
|
||||||
auto & sparams = params.sampling;
|
|
||||||
|
|
||||||
console::init(params.simple_io, params.use_color);
|
|
||||||
atexit([]() { console::cleanup(); });
|
|
||||||
|
|
||||||
if (params.logits_all) {
|
|
||||||
LOG_ERR("\n************\n");
|
|
||||||
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
|
||||||
LOG_ERR("************\n\n");
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.embedding) {
|
|
||||||
LOG_ERR("\n************\n");
|
|
||||||
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
|
||||||
LOG_ERR("************\n\n");
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
|
||||||
LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
|
|
||||||
params.n_ctx = 8;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
|
|
||||||
LOG_ERR("\n************\n");
|
|
||||||
LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
|
|
||||||
LOG_ERR("************\n\n");
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.rope_freq_base != 0.0) {
|
|
||||||
LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.rope_freq_scale != 0.0) {
|
|
||||||
LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG_INF("%s: llama backend init\n", __func__);
|
|
||||||
llama_backend_init();
|
|
||||||
llama_numa_init(params.numa);
|
|
||||||
|
|
||||||
llama_model * model = nullptr;
|
|
||||||
llama_context * ctx = nullptr;
|
|
||||||
common_sampler * smpl = nullptr;
|
|
||||||
|
|
||||||
g_model = &model;
|
|
||||||
g_ctx = &ctx;
|
|
||||||
g_smpl = &smpl;
|
|
||||||
|
|
||||||
// load the model and apply lora adapter, if any
|
|
||||||
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
|
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
|
||||||
|
|
||||||
model = llama_init.model.get();
|
|
||||||
ctx = llama_init.context.get();
|
|
||||||
|
|
||||||
if (model == NULL) {
|
|
||||||
LOG_ERR("%s: unable to load model\n", __func__);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
|
||||||
|
|
||||||
const int n_ctx_train = llama_model_n_ctx_train(model);
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
|
||||||
LOG_DBG("n_ctx: %d\n", n_ctx);
|
|
||||||
|
|
||||||
if (n_ctx > n_ctx_train) {
|
|
||||||
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
|
|
||||||
}
|
|
||||||
|
|
||||||
// print system information
|
|
||||||
{
|
|
||||||
LOG_INF("\n");
|
|
||||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
|
||||||
}
|
|
||||||
const bool add_bos = llama_vocab_get_add_bos(vocab);
|
|
||||||
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
|
|
||||||
|
|
||||||
std::vector<llama_token> embd_inp;
|
|
||||||
std::vector<llama_token> embd_end;
|
|
||||||
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
|
|
||||||
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
|
|
||||||
|
|
||||||
GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0);
|
|
||||||
GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0);
|
|
||||||
|
|
||||||
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
|
|
||||||
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
|
|
||||||
|
|
||||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
|
||||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
|
||||||
if (add_bos) {
|
|
||||||
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
|
|
||||||
}
|
|
||||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
|
||||||
|
|
||||||
const llama_token middle_token = llama_vocab_fim_mid(vocab);
|
|
||||||
if (middle_token >= 0) {
|
|
||||||
embd_inp.push_back(middle_token);
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG_DBG("add_bos: %d\n", add_bos);
|
|
||||||
LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
|
|
||||||
LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
|
|
||||||
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
|
|
||||||
|
|
||||||
// Should not run without any tokens
|
|
||||||
if (embd_inp.empty()) {
|
|
||||||
embd_inp.push_back(llama_vocab_bos(vocab));
|
|
||||||
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
|
||||||
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// number of tokens to keep when resetting context
|
|
||||||
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
|
|
||||||
params.n_keep = (int)embd_inp.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
|
|
||||||
LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
|
|
||||||
|
|
||||||
// enable interactive mode if interactive start is specified
|
|
||||||
if (params.interactive_first) {
|
|
||||||
params.interactive = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.verbose_prompt) {
|
|
||||||
LOG_INF("\n");
|
|
||||||
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
|
||||||
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
|
||||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
|
||||||
LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.n_keep > 0) {
|
|
||||||
LOG_INF("%s: static prompt based on n_keep: '", __func__);
|
|
||||||
for (int i = 0; i < params.n_keep; i++) {
|
|
||||||
LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
|
|
||||||
}
|
|
||||||
LOG_CNT("'\n");
|
|
||||||
}
|
|
||||||
LOG_INF("\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.interactive) {
|
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
|
||||||
struct sigaction sigint_action;
|
|
||||||
sigint_action.sa_handler = sigint_handler;
|
|
||||||
sigemptyset (&sigint_action.sa_mask);
|
|
||||||
sigint_action.sa_flags = 0;
|
|
||||||
sigaction(SIGINT, &sigint_action, NULL);
|
|
||||||
#elif defined (_WIN32)
|
|
||||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
|
||||||
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
|
||||||
};
|
|
||||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
LOG_INF("%s: interactive mode on.\n", __func__);
|
|
||||||
|
|
||||||
if (params.input_prefix_bos) {
|
|
||||||
LOG_INF("Input prefix with BOS\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!params.input_prefix.empty()) {
|
|
||||||
LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!params.input_suffix.empty()) {
|
|
||||||
LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
smpl = common_sampler_init(model, sparams);
|
|
||||||
|
|
||||||
LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
|
|
||||||
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
|
|
||||||
LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
|
|
||||||
|
|
||||||
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
|
||||||
|
|
||||||
LOG_INF("\n");
|
|
||||||
LOG_INF("\n##### Infill mode #####\n\n");
|
|
||||||
if (params.interactive) {
|
|
||||||
const char *control_message;
|
|
||||||
if (params.multiline_input) {
|
|
||||||
control_message = " - To return control to LLaMA, end your input with '\\'.\n"
|
|
||||||
" - To return control without starting a new line, end your input with '/'.\n";
|
|
||||||
} else {
|
|
||||||
control_message = " - Press Return to return control to LLaMA.\n"
|
|
||||||
" - To return control without starting a new line, end your input with '/'.\n"
|
|
||||||
" - If you want to submit another line, end your input with '\\'.\n";
|
|
||||||
}
|
|
||||||
LOG_INF("== Running in interactive mode. ==\n");
|
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
|
||||||
LOG_INF( " - Press Ctrl+C to interject at any time.\n");
|
|
||||||
#endif
|
|
||||||
LOG_INF( "%s\n", control_message);
|
|
||||||
|
|
||||||
is_interacting = params.interactive_first;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool input_echo = true;
|
|
||||||
|
|
||||||
int n_past = 0;
|
|
||||||
int n_remain = params.n_predict;
|
|
||||||
int n_consumed = 0;
|
|
||||||
|
|
||||||
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
|
||||||
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
|
||||||
std::ostringstream output_ss; g_output_ss = &output_ss;
|
|
||||||
|
|
||||||
// the first thing we will do is to output the prompt, so set color accordingly
|
|
||||||
console::set_display(console::prompt);
|
|
||||||
|
|
||||||
std::vector<llama_token> embd;
|
|
||||||
|
|
||||||
while (n_remain != 0 || params.interactive) {
|
|
||||||
// predict
|
|
||||||
if (!embd.empty()) {
|
|
||||||
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
|
|
||||||
// --prompt or --file which uses the same value.
|
|
||||||
int max_embd_size = n_ctx - 4;
|
|
||||||
|
|
||||||
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
|
|
||||||
if ((int) embd.size() > max_embd_size) {
|
|
||||||
const int skipped_tokens = (int) embd.size() - max_embd_size;
|
|
||||||
embd.resize(max_embd_size);
|
|
||||||
|
|
||||||
console::set_display(console::error);
|
|
||||||
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
|
||||||
console::set_display(console::reset);
|
|
||||||
}
|
|
||||||
|
|
||||||
// infinite text generation via context swapping
|
|
||||||
// if we run out of context:
|
|
||||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
|
||||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
|
||||||
if (n_past + (int) embd.size() > n_ctx) {
|
|
||||||
if (params.n_predict == -2) {
|
|
||||||
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int n_left = n_past - params.n_keep - 1;
|
|
||||||
const int n_discard = n_left/2;
|
|
||||||
|
|
||||||
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
|
||||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
|
||||||
|
|
||||||
llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
|
||||||
llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
|
||||||
|
|
||||||
n_past -= n_discard;
|
|
||||||
|
|
||||||
LOG_DBG("after swap: n_past = %d\n", n_past);
|
|
||||||
|
|
||||||
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// evaluate tokens in batches
|
|
||||||
// embd is typically prepared beforehand to fit within a batch, but not always
|
|
||||||
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
|
||||||
int n_eval = (int) embd.size() - i;
|
|
||||||
if (n_eval > params.n_batch) {
|
|
||||||
n_eval = params.n_batch;
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
|
|
||||||
|
|
||||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
|
|
||||||
LOG_ERR("%s : failed to eval\n", __func__);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
n_past += n_eval;
|
|
||||||
|
|
||||||
LOG_DBG("n_past = %d\n", n_past);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
embd.clear();
|
|
||||||
|
|
||||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
|
||||||
const llama_token id = common_sampler_sample(smpl, ctx, -1);
|
|
||||||
|
|
||||||
common_sampler_accept(smpl, id, true);
|
|
||||||
|
|
||||||
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
|
|
||||||
|
|
||||||
embd.push_back(id);
|
|
||||||
|
|
||||||
// echo this to console
|
|
||||||
input_echo = true;
|
|
||||||
|
|
||||||
// decrement remaining sampling budget
|
|
||||||
--n_remain;
|
|
||||||
|
|
||||||
LOG_DBG("n_remain: %d\n", n_remain);
|
|
||||||
} else {
|
|
||||||
// some user input remains from prompt or interaction, forward it to processing
|
|
||||||
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
|
||||||
while ((int) embd_inp.size() > n_consumed) {
|
|
||||||
embd.push_back(embd_inp[n_consumed]);
|
|
||||||
|
|
||||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
|
||||||
// for the prompt, we don't apply grammar rules
|
|
||||||
common_sampler_accept(smpl, embd_inp[n_consumed], false);
|
|
||||||
|
|
||||||
++n_consumed;
|
|
||||||
if ((int) embd.size() >= params.n_batch) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// display text
|
|
||||||
if (input_echo) {
|
|
||||||
for (auto id : embd) {
|
|
||||||
const std::string token_str = common_token_to_piece(ctx, id);
|
|
||||||
LOG("%s", token_str.c_str());
|
|
||||||
|
|
||||||
if (embd.size() > 1) {
|
|
||||||
input_tokens.push_back(id);
|
|
||||||
} else {
|
|
||||||
output_tokens.push_back(id);
|
|
||||||
output_ss << token_str;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// reset color to default if there is no pending user input
|
|
||||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
|
||||||
console::set_display(console::reset);
|
|
||||||
}
|
|
||||||
|
|
||||||
// if not currently processing queued inputs;
|
|
||||||
if ((int) embd_inp.size() <= n_consumed) {
|
|
||||||
// deal with eot token in infill mode
|
|
||||||
if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){
|
|
||||||
if (is_interacting && !params.interactive_first) {
|
|
||||||
// print an eot token
|
|
||||||
LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
|
|
||||||
}
|
|
||||||
LOG("\n");
|
|
||||||
console::set_display(console::user_input);
|
|
||||||
std::string buffer;
|
|
||||||
std::string line;
|
|
||||||
bool another_line=true;
|
|
||||||
// set a new prefix via stdin
|
|
||||||
do {
|
|
||||||
another_line = console::readline(line, params.multiline_input);
|
|
||||||
buffer += line;
|
|
||||||
} while (another_line);
|
|
||||||
// check if we got an empty line, if so we use the old input
|
|
||||||
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
|
||||||
params.input_prefix = buffer;
|
|
||||||
}
|
|
||||||
buffer.clear();
|
|
||||||
// set a new suffix via stdin
|
|
||||||
do {
|
|
||||||
another_line = console::readline(line, params.multiline_input);
|
|
||||||
buffer += line;
|
|
||||||
} while (another_line);
|
|
||||||
// check if we got an empty line
|
|
||||||
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
|
||||||
params.input_suffix = buffer;
|
|
||||||
}
|
|
||||||
buffer.clear();
|
|
||||||
// done taking input, reset color
|
|
||||||
console::set_display(console::reset);
|
|
||||||
|
|
||||||
if (params.escape) {
|
|
||||||
// process escape sequences; for the initial prompt this is done in common.cpp when the params are loaded, but in interactive mode we need to do it here
|
|
||||||
string_process_escapes(params.input_prefix);
|
|
||||||
string_process_escapes(params.input_suffix);
|
|
||||||
}
|
|
||||||
|
|
||||||
// tokenize new prefix and suffix
|
|
||||||
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
|
|
||||||
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
|
|
||||||
|
|
||||||
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
|
|
||||||
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
|
|
||||||
|
|
||||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
|
||||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
|
||||||
if (add_bos) {
|
|
||||||
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
|
|
||||||
}
|
|
||||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
|
||||||
|
|
||||||
if (middle_token >= 0) {
|
|
||||||
embd_inp.push_back(middle_token);
|
|
||||||
}
|
|
||||||
|
|
||||||
embd.clear();
|
|
||||||
n_remain = params.n_predict;
|
|
||||||
n_past = 0;
|
|
||||||
n_consumed = 0;
|
|
||||||
is_interacting = false;
|
|
||||||
}
|
|
||||||
// deal with end of generation tokens in interactive mode
|
|
||||||
else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
|
|
||||||
LOG_DBG("found EOS token\n");
|
|
||||||
|
|
||||||
if (params.interactive) {
|
|
||||||
|
|
||||||
is_interacting = true;
|
|
||||||
LOG("\n");
|
|
||||||
console::set_display(console::user_input);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (n_past > 0 && is_interacting && !params.interactive) {
|
|
||||||
LOG_DBG("waiting for user input\n");
|
|
||||||
|
|
||||||
if (params.input_prefix_bos) {
|
|
||||||
LOG_DBG("adding input prefix BOS token\n");
|
|
||||||
embd_inp.push_back(llama_vocab_bos(vocab));
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string buffer;
|
|
||||||
if (!params.input_prefix.empty()) {
|
|
||||||
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
|
||||||
buffer += params.input_prefix;
|
|
||||||
LOG("%s", buffer.c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string line;
|
|
||||||
bool another_line = true;
|
|
||||||
do {
|
|
||||||
another_line = console::readline(line, params.multiline_input);
|
|
||||||
buffer += line;
|
|
||||||
} while (another_line);
|
|
||||||
|
|
||||||
// done taking input, reset color
|
|
||||||
console::set_display(console::reset);
|
|
||||||
|
|
||||||
// Add tokens to embd only if the input buffer is non-empty
|
|
||||||
// Entering an empty line lets the user pass control back
|
|
||||||
if (buffer.length() > 1) {
|
|
||||||
// append input suffix if any
|
|
||||||
if (!params.input_suffix.empty()) {
|
|
||||||
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
|
||||||
buffer += params.input_suffix;
|
|
||||||
LOG("%s", params.input_suffix.c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG_DBG("buffer: '%s'\n", buffer.c_str());
|
|
||||||
|
|
||||||
const size_t original_size = embd_inp.size();
|
|
||||||
|
|
||||||
const auto line_inp = common_tokenize(ctx, buffer, false);
|
|
||||||
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
|
|
||||||
|
|
||||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
|
||||||
|
|
||||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
|
||||||
const llama_token token = embd_inp[i];
|
|
||||||
output_tokens.push_back(token);
|
|
||||||
output_ss << common_token_to_piece(ctx, token);
|
|
||||||
}
|
|
||||||
|
|
||||||
n_remain -= line_inp.size();
|
|
||||||
LOG_DBG("n_remain: %d\n", n_remain);
|
|
||||||
} else {
|
|
||||||
LOG_DBG("empty line, passing control back\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
input_echo = false; // do not echo this again
|
|
||||||
}
|
|
||||||
|
|
||||||
if (n_past > 0) {
|
|
||||||
if (is_interacting) {
|
|
||||||
common_sampler_reset(smpl);
|
|
||||||
}
|
|
||||||
is_interacting = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// end of generation
|
|
||||||
if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
|
|
||||||
// We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
|
|
||||||
if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
|
|
||||||
n_remain = params.n_predict;
|
|
||||||
is_interacting = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!params.interactive && n_remain <= 0) {
|
|
||||||
LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG("\n");
|
|
||||||
common_perf_print(ctx, smpl);
|
|
||||||
|
|
||||||
common_sampler_free(smpl);
|
|
||||||
llama_backend_free();
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
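When the infill example above ran out of context it kept the first `n_keep` tokens and discarded half of the remainder, shifting the surviving positions back so generation could continue. The same shift expressed with the `llama_memory_*` API used elsewhere in this merge (a sketch, assuming `llama_memory_seq_rm`/`llama_memory_seq_add` keep the position semantics of the old `llama_kv_self_*` calls shown above):

```cpp
#include "llama.h"

// discard the oldest generated tokens (after the first n_keep) and shift the rest back;
// returns the updated n_past
static int shift_context(llama_context * ctx, int n_past, int n_keep) {
    const int n_left    = n_past - n_keep - 1;
    const int n_discard = n_left / 2;

    llama_memory_t mem = llama_get_memory(ctx);

    // remove positions [n_keep + 1, n_keep + n_discard + 1) for sequence 0 ...
    llama_memory_seq_rm (mem, 0, n_keep + 1,             n_keep + n_discard + 1);
    // ... and slide the remaining tokens down by n_discard positions
    llama_memory_seq_add(mem, 0, n_keep + 1 + n_discard, n_past, -n_discard);

    return n_past - n_discard;
}
```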
@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
|
|||||||
}
|
}
|
||||||
|
|
||||||
batch->logits[batch->n_tokens - 1] = true;
|
batch->logits[batch->n_tokens - 1] = true;
|
||||||
llama_kv_self_clear(context);
|
llama_memory_clear(llama_get_memory(context), false);
|
||||||
|
|
||||||
const auto t_pp_start = ggml_time_us();
|
const auto t_pp_start = ggml_time_us();
|
||||||
if (llama_decode(context, *batch) != 0) {
|
if (llama_decode(context, *batch) != 0) {
|
||||||
@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
|
|||||||
|
|
||||||
LOGi("Benchmark text generation (tg)");
|
LOGi("Benchmark text generation (tg)");
|
||||||
|
|
||||||
llama_kv_self_clear(context);
|
llama_memory_clear(llama_get_memory(context), false);
|
||||||
const auto t_tg_start = ggml_time_us();
|
const auto t_tg_start = ggml_time_us();
|
||||||
for (i = 0; i < tg; i++) {
|
for (i = 0; i < tg; i++) {
|
||||||
|
|
||||||
@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
|
|||||||
|
|
||||||
const auto t_tg_end = ggml_time_us();
|
const auto t_tg_end = ggml_time_us();
|
||||||
|
|
||||||
llama_kv_self_clear(context);
|
llama_memory_clear(llama_get_memory(context), false);
|
||||||
|
|
||||||
const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
|
const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
|
||||||
const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
|
const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
|
||||||
@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
|
|||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
||||||
llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
|
llama_memory_clear(llama_get_memory(reinterpret_cast<llama_context *>(context)), true);
|
||||||
}
|
}
|
||||||
|
@ -210,7 +210,7 @@ actor LlamaContext {
|
|||||||
}
|
}
|
||||||
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
|
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
|
||||||
|
|
||||||
llama_kv_self_clear(context)
|
llama_memory_clear(llama_get_memory(context), false)
|
||||||
|
|
||||||
let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
|
let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
|
||||||
|
|
||||||
@ -223,7 +223,7 @@ actor LlamaContext {
|
|||||||
|
|
||||||
// bench text generation
|
// bench text generation
|
||||||
|
|
||||||
llama_kv_self_clear(context)
|
llama_memory_clear(llama_get_memory(context), false)
|
||||||
|
|
||||||
let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
|
let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
|
||||||
|
|
||||||
@ -242,7 +242,7 @@ actor LlamaContext {
|
|||||||
|
|
||||||
let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
|
let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
|
||||||
|
|
||||||
llama_kv_self_clear(context)
|
llama_memory_clear(llama_get_memory(context), false)
|
||||||
|
|
||||||
let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
|
let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
|
||||||
let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
|
let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
|
||||||
@ -292,7 +292,7 @@ actor LlamaContext {
|
|||||||
func clear() {
|
func clear() {
|
||||||
tokens_list.removeAll()
|
tokens_list.removeAll()
|
||||||
temporary_invalid_cchars.removeAll()
|
temporary_invalid_cchars.removeAll()
|
||||||
llama_kv_self_clear(context)
|
llama_memory_clear(llama_get_memory(context), true)
|
||||||
}
|
}
|
||||||
|
|
||||||
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||||
|
@ -1,81 +0,0 @@
|
|||||||
# llava (legacy)
|
|
||||||
|
|
||||||
add_library(llava OBJECT
|
|
||||||
llava.cpp
|
|
||||||
llava.h
|
|
||||||
clip.cpp
|
|
||||||
clip.h
|
|
||||||
)
|
|
||||||
|
|
||||||
target_link_libraries(llava PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
|
|
||||||
|
|
||||||
target_include_directories(llava PUBLIC .)
|
|
||||||
target_include_directories(llava PUBLIC ../..)
|
|
||||||
target_include_directories(llava PUBLIC ../../common)
|
|
||||||
|
|
||||||
target_compile_features(llava PRIVATE cxx_std_17)
|
|
||||||
|
|
||||||
add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
|
|
||||||
if (BUILD_SHARED_LIBS)
|
|
||||||
set_target_properties(llava PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
|
||||||
target_compile_definitions(llava PRIVATE LLAMA_SHARED LLAMA_BUILD)
|
|
||||||
add_library(llava_shared SHARED $<TARGET_OBJECTS:llava>)
|
|
||||||
target_link_libraries(llava_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
|
|
||||||
install(TARGETS llava_shared LIBRARY)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# mtmd
|
|
||||||
|
|
||||||
add_library(mtmd OBJECT
|
|
||||||
mtmd.cpp
|
|
||||||
mtmd.h
|
|
||||||
clip.cpp
|
|
||||||
clip.h
|
|
||||||
clip-impl.h
|
|
||||||
)
|
|
||||||
|
|
||||||
target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
|
|
||||||
|
|
||||||
target_include_directories(mtmd PUBLIC .)
|
|
||||||
target_include_directories(mtmd PRIVATE ../..)
|
|
||||||
target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h
|
|
||||||
|
|
||||||
target_compile_features(mtmd PRIVATE cxx_std_17)
|
|
||||||
|
|
||||||
add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
|
|
||||||
if (BUILD_SHARED_LIBS)
|
|
||||||
set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
|
||||||
target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
|
|
||||||
add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
|
|
||||||
target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
|
|
||||||
install(TARGETS mtmd_shared LIBRARY)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (NOT MSVC)
|
|
||||||
target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
|
|
||||||
target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if(TARGET BUILD_INFO)
|
|
||||||
add_dependencies(llava BUILD_INFO)
|
|
||||||
add_dependencies(mtmd BUILD_INFO)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
add_executable(llama-llava-cli deprecation-warning.cpp)
|
|
||||||
add_executable(llama-gemma3-cli deprecation-warning.cpp)
|
|
||||||
add_executable(llama-minicpmv-cli deprecation-warning.cpp)
|
|
||||||
add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
|
|
||||||
|
|
||||||
set(TARGET llama-mtmd-cli)
|
|
||||||
add_executable(${TARGET} mtmd-cli.cpp)
|
|
||||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
|
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
|
||||||
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
|
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
|
||||||
|
|
||||||
set(TARGET llama-llava-clip-quantize-cli)
|
|
||||||
add_executable(${TARGET} clip-quantize-cli.cpp)
|
|
||||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli)
|
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
|
||||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
|
@ -1,44 +0,0 @@
|
|||||||
# Quantizing CLIP Visual Projector
|
|
||||||
|
|
||||||
This is the tool for quantizing the CLIP visual projector model. Quantization reduces the precision of the model's weights, which can significantly decrease the model size and improve inference speed, often with minimal impact on performance.
|
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
To quantize a CLIP visual projector model, use the following command:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
./bin/llama-llava-clip-quantize-cli /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf <type>
|
|
||||||
```
|
|
||||||
|
|
||||||
After quantization, the visual projector can be used freely with the existing LLaVA CLI (LLaVA, Qwen2VL, etc.).
|
|
||||||
|
|
||||||
### Arguments
|
|
||||||
|
|
||||||
- `/path/to/ggml-model-f32.gguf`: The path to the input model file in FP32 or FP16 format.
|
|
||||||
- `/path/to/ggml-model-quantized.gguf`: The path where the quantized model will be saved.
|
|
||||||
- `<type>`: The quantization type to apply. This should be an integer corresponding to one of the quantization types defined in the `enum ggml_type`.
|
|
||||||
|
|
||||||
### Quantization Types
|
|
||||||
|
|
||||||
The following quantization types are supported, based on the `enum ggml_type` definition:
|
|
||||||
|
|
||||||
- `2` - `q4_0`: 4-bit quantization with a per-block scale.
|
|
||||||
- `3` - `q4_1`: 4-bit quantization with a per-block scale and minimum.
|
|
||||||
- `6` - `q5_0`: 5-bit quantization with a per-block scale.
|
|
||||||
- `7` - `q5_1`: 5-bit quantization with a per-block scale and minimum.
|
|
||||||
- `8` - `q8_0`: 8-bit quantization with a per-block scale.
|
|
||||||
|
|
||||||
### Example
|
|
||||||
|
|
||||||
To quantize a model using the `q4_0` quantization type, you would run:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
./bin/llama-llava-clip-quantize-cli /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf 2
|
|
||||||
```
|
|
||||||
|
|
||||||
This command will generate a quantized model at `/path/to/ggml-model-quantized.gguf` using the `q4_0` quantization method.
|
|
||||||
|
|
||||||
## Notes
|
|
||||||
|
|
||||||
- Quantization can lead to a loss in model accuracy, depending on the chosen quantization type. It is recommended to evaluate the quantized model's performance on your specific task to ensure it meets your requirements.
|
|
||||||
- The quantized model will typically be smaller in size and faster to run, making it more suitable for deployment in resource-constrained environments.
|
|
@ -1,53 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed"
|
|
||||||
projector_name="mmproj-model-f16.gguf"
|
|
||||||
llama_name="ggml-model-q4_k.gguf"
|
|
||||||
img_dir="/Users/cxt/model/llm"
|
|
||||||
img_name="demo.jpg"
|
|
||||||
prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
|
|
||||||
# img_name="cat.jpeg"
|
|
||||||
# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
|
|
||||||
|
|
||||||
program_dir="build_64/bin"
|
|
||||||
binName="llama-mtmd-cli"
|
|
||||||
n_threads=4
|
|
||||||
|
|
||||||
|
|
||||||
deviceDir="/data/local/tmp"
|
|
||||||
saveDir="output"
|
|
||||||
if [ ! -d ${saveDir} ]; then
|
|
||||||
mkdir ${saveDir}
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
|
||||||
function android_run() {
|
|
||||||
# # copy resource into device
|
|
||||||
# adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
|
|
||||||
# adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
|
|
||||||
adb push ${img_dir}/${img_name} ${deviceDir}/${img_name}
|
|
||||||
# copy program into device
|
|
||||||
adb push ${program_dir}/${binName} ${deviceDir}/${binName}
|
|
||||||
adb shell "chmod 0777 ${deviceDir}/${binName}"
|
|
||||||
|
|
||||||
# run
|
|
||||||
adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \
|
|
||||||
-m ${deviceDir}/${llama_name} \
|
|
||||||
--mmproj ${deviceDir}/${projector_name} \
|
|
||||||
-t ${n_threads} \
|
|
||||||
--image ${deviceDir}/${img_name} \
|
|
||||||
-p \"${prompt}\" \
|
|
||||||
> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt"
|
|
||||||
adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \
|
|
||||||
-m ${deviceDir}/${llama_name} \
|
|
||||||
--mmproj ${deviceDir}/${projector_name} \
|
|
||||||
-t ${n_threads} \
|
|
||||||
--image ${deviceDir}/${img_name} \
|
|
||||||
-p \"${prompt}\" \
|
|
||||||
>> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt 2>&1"
|
|
||||||
adb pull ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt ${saveDir}
|
|
||||||
}
|
|
||||||
|
|
||||||
android_run
|
|
||||||
|
|
||||||
echo "android_run is Done!"
|
|
@ -1,8 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
cmake ../../../../ \
|
|
||||||
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
|
|
||||||
-DCMAKE_BUILD_TYPE=Release \
|
|
||||||
-DANDROID_ABI="arm64-v8a" \
|
|
||||||
-DANDROID_PLATFORM=android-23 $1
|
|
||||||
|
|
||||||
make -j4
|
|
@ -1,59 +0,0 @@
|
|||||||
#include "arg.h"
|
|
||||||
#include "base64.hpp"
|
|
||||||
#include "log.h"
|
|
||||||
#include "common.h"
|
|
||||||
#include "sampling.h"
|
|
||||||
#include "clip.h"
|
|
||||||
#include "llava.h"
|
|
||||||
#include "llama.h"
|
|
||||||
#include "ggml.h"
|
|
||||||
|
|
||||||
static void print_usage(int argc, char ** argv) {
|
|
||||||
(void) argc;
|
|
||||||
|
|
||||||
fprintf(stderr, "usage: %s /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf type\n", argv[0]);
|
|
||||||
fprintf(stderr, " type = 2 - q4_0\n");
|
|
||||||
fprintf(stderr, " type = 3 - q4_1\n");
|
|
||||||
fprintf(stderr, " type = 6 - q5_0\n");
|
|
||||||
fprintf(stderr, " type = 7 - q5_1\n");
|
|
||||||
fprintf(stderr, " type = 8 - q8_0\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
|
||||||
if (argc != 4) {
|
|
||||||
print_usage(argc, argv);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string fname_inp = argv[1];
|
|
||||||
const std::string fname_out = argv[2];
|
|
||||||
|
|
||||||
const int itype = atoi(argv[3]);
|
|
||||||
|
|
||||||
const int64_t t_main_start_us = ggml_time_us();
|
|
||||||
|
|
||||||
int64_t t_quantize_us = 0;
|
|
||||||
|
|
||||||
// load the model
|
|
||||||
{
|
|
||||||
const int64_t t_start_us = ggml_time_us();
|
|
||||||
|
|
||||||
if (!clip_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
|
|
||||||
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
t_quantize_us = ggml_time_us() - t_start_us;
|
|
||||||
}
|
|
||||||
|
|
||||||
// report timing
|
|
||||||
{
|
|
||||||
const int64_t t_main_end_us = ggml_time_us();
|
|
||||||
|
|
||||||
printf("\n");
|
|
||||||
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0f);
|
|
||||||
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
File diff suppressed because it is too large
@ -1,135 +0,0 @@
|
|||||||
#ifndef CLIP_H
|
|
||||||
#define CLIP_H
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
#ifdef LLAMA_SHARED
|
|
||||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
|
||||||
# ifdef LLAMA_BUILD
|
|
||||||
# define CLIP_API __declspec(dllexport)
|
|
||||||
# else
|
|
||||||
# define CLIP_API __declspec(dllimport)
|
|
||||||
# endif
|
|
||||||
# else
|
|
||||||
# define CLIP_API __attribute__ ((visibility ("default")))
|
|
||||||
# endif
|
|
||||||
#else
|
|
||||||
# define CLIP_API
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct clip_ctx;
|
|
||||||
|
|
||||||
struct clip_image_size {
|
|
||||||
int width;
|
|
||||||
int height;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct clip_image_f32;
|
|
||||||
struct clip_image_u8_batch;
|
|
||||||
struct clip_image_f32_batch;
|
|
||||||
|
|
||||||
struct clip_context_params {
|
|
||||||
bool use_gpu;
|
|
||||||
enum ggml_log_level verbosity;
|
|
||||||
};
|
|
||||||
|
|
||||||
// deprecated, use clip_init
|
|
||||||
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
|
|
||||||
|
|
||||||
CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
|
|
||||||
|
|
||||||
CLIP_API void clip_free(struct clip_ctx * ctx);
|
|
||||||
|
|
||||||
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
|
|
||||||
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
|
|
||||||
|
|
||||||
CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
|
|
||||||
CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
|
|
||||||
CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
|
|
||||||
|
|
||||||
// TODO: should be enum, not string
|
|
||||||
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
|
|
||||||
|
|
||||||
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
|
|
||||||
CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
|
|
||||||
|
|
||||||
GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
|
|
||||||
"use clip_n_output_tokens instead");
|
|
||||||
GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
|
|
||||||
"use clip_n_output_tokens instead");
|
|
||||||
|
|
||||||
CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
|
||||||
|
|
||||||
// for M-RoPE, this will be the number of token positions in X and Y directions
|
|
||||||
// for other models, X will be the total number of tokens and Y will be 1
|
|
||||||
CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
|
||||||
CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
|
||||||
|
|
||||||
// this should be equal to the embedding dimension of the text model
|
|
||||||
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
|
||||||
|
|
||||||
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
|
|
||||||
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
|
|
||||||
CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
|
|
||||||
|
|
||||||
CLIP_API struct clip_image_size * clip_image_size_init();
|
|
||||||
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
|
|
||||||
CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
|
||||||
CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava
|
|
||||||
|
|
||||||
// nx, ny are the output image dimensions
|
|
||||||
CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
|
|
||||||
|
|
||||||
CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
|
|
||||||
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
|
|
||||||
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
|
|
||||||
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
|
|
||||||
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
|
|
||||||
|
|
||||||
// use for accessing underlay data of clip_image_f32_batch
|
|
||||||
CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
|
|
||||||
CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
|
|
||||||
CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
|
|
||||||
CLIP_API struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
|
|
||||||
* The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
|
|
||||||
*/
|
|
||||||
CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
|
|
||||||
|
|
||||||
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
|
||||||
|
|
||||||
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
|
|
||||||
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
|
|
||||||
|
|
||||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
|
||||||
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
|
||||||
|
|
||||||
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
|
||||||
|
|
||||||
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
|
||||||
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
|
|
||||||
|
|
||||||
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
|
|
||||||
|
|
||||||
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
|
|
||||||
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
|
|
||||||
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
|
|
||||||
CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
|
|
||||||
CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
|
|
||||||
|
|
||||||
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif // CLIP_H
|
|
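The removed `clip.h` above exposes the image loading, preprocessing and encoding entry points that the legacy llava tools were built on. A minimal usage sketch assembled only from the declarations shown in that header (illustrative; the struct fields and batch accessors are taken from the removed header, not from a current API):

```cpp
#include "clip.h"

#include <cstdio>
#include <vector>

// encode one image file into projector embeddings with the legacy clip API
static bool encode_image(const char * model_path, const char * image_path, int n_threads) {
    clip_context_params cparams = { /*use_gpu =*/ true, /*verbosity =*/ GGML_LOG_LEVEL_INFO };
    clip_ctx * ctx = clip_init(model_path, cparams);
    if (ctx == nullptr) {
        return false;
    }

    clip_image_u8 * img = clip_image_u8_init();
    bool ok = clip_image_load_from_file(image_path, img);

    clip_image_f32_batch * batch = clip_image_f32_batch_init();
    ok = ok && clip_image_preprocess(ctx, img, batch);

    if (ok) {
        // one embedding buffer per preprocessed tile
        for (size_t i = 0; ok && i < clip_image_f32_batch_n_images(batch); i++) {
            clip_image_f32 * tile = clip_image_f32_get_img(batch, i);
            std::vector<float> embd((size_t) clip_n_output_tokens(ctx, tile) * clip_n_mmproj_embd(ctx));
            ok = clip_image_encode(ctx, n_threads, tile, embd.data());
            printf("tile %zu: %d tokens x %d dims\n", i, clip_n_output_tokens(ctx, tile), clip_n_mmproj_embd(ctx));
        }
    }

    clip_image_f32_batch_free(batch);
    clip_image_u8_free(img);
    clip_free(ctx);
    return ok;
}
```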
@ -1,586 +0,0 @@
|
|||||||
#include "clip.h"
|
|
||||||
#include "llava.h"
|
|
||||||
|
|
||||||
#include "llama.h"
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
#include <cerrno>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <cstring>
|
|
||||||
#include <limits>
|
|
||||||
#include <vector>
|
|
||||||
#include <memory>
|
|
||||||
|
|
||||||
#if defined(LLAVA_LOG_OFF)
|
|
||||||
# define LOG_INF(...)
|
|
||||||
# define LOG_WRN(...)
|
|
||||||
# define LOG_ERR(...)
|
|
||||||
# define LOG_DBG(...)
|
|
||||||
#else // defined(LLAVA_LOG_OFF)
|
|
||||||
# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
|
||||||
# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
|
||||||
# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
|
||||||
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
|
||||||
#endif // defined(LLAVA_LOG_OFF)
|
|
||||||
|
|
||||||
// RGB uint8 image
|
|
||||||
struct clip_image_u8 {
|
|
||||||
int nx;
|
|
||||||
int ny;
|
|
||||||
|
|
||||||
std::vector<uint8_t> buf;
|
|
||||||
};
|
|
||||||
|
|
||||||
// RGB float32 image (NHWC)
|
|
||||||
// Memory layout: RGBRGBRGB...
|
|
||||||
struct clip_image_f32 {
|
|
||||||
int nx;
|
|
||||||
int ny;
|
|
||||||
|
|
||||||
std::vector<float> buf;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct clip_image_grid_shape {
|
|
||||||
int first;
|
|
||||||
int second;
|
|
||||||
};
|
|
||||||
|
|
||||||
// convenience cpp wrapper
|
|
||||||
struct clip_image_f32_batch_deleter {
|
|
||||||
void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
|
|
||||||
};
|
|
||||||
typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;
|
|
||||||
|
|
||||||
struct clip_image_size_deleter {
|
|
||||||
void operator()(clip_image_size * val) { clip_image_size_free(val); }
|
|
||||||
};
|
|
||||||
typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Selects the best resolution from a list of possible resolutions based on the original size.
|
|
||||||
*
|
|
||||||
* @param original_size The original size of the image in the format (width, height).
|
|
||||||
* @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
|
|
||||||
* @return The best fit resolution in the format (width, height).
|
|
||||||
*/
|
|
||||||
static std::pair<int, int> select_best_resolution(const std::pair<int, int>& original_size, const std::vector<std::pair<int, int>>& possible_resolutions) {
|
|
||||||
int original_width = original_size.first;
|
|
||||||
int original_height = original_size.second;
|
|
||||||
|
|
||||||
std::pair<int, int> best_fit;
|
|
||||||
int max_effective_resolution = 0;
|
|
||||||
int min_wasted_resolution = std::numeric_limits<int>::max();
|
|
||||||
|
|
||||||
for (const auto& resolution : possible_resolutions) {
|
|
||||||
int width = resolution.first;
|
|
||||||
int height = resolution.second;
|
|
||||||
float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
|
|
||||||
int downscaled_width = static_cast<int>(original_width * scale);
|
|
||||||
int downscaled_height = static_cast<int>(original_height * scale);
|
|
||||||
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
|
||||||
int wasted_resolution = (width * height) - effective_resolution;
|
|
||||||
// LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
|
||||||
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
|
||||||
max_effective_resolution = effective_resolution;
|
|
||||||
min_wasted_resolution = wasted_resolution;
|
|
||||||
best_fit = resolution;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return best_fit;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Get the anyres image grid shape object
|
|
||||||
*
|
|
||||||
* @param image_size
|
|
||||||
* @param grid_pinpoints
|
|
||||||
* @param image_patch_size
|
|
||||||
* @return <int, int>
|
|
||||||
*/
|
|
||||||
static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int> & image_size, const std::vector<std::pair<int, int>> & grid_pinpoints, int image_patch_size) {
|
|
||||||
/**
|
|
||||||
Conversion from gguf flat array to vector:
|
|
||||||
std::vector<std::pair<int, int>> possible_resolutions;
|
|
||||||
for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
|
|
||||||
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
|
|
||||||
return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
|
|
||||||
}
|
|
||||||
|
|
||||||
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
|
|
||||||
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
|
|
||||||
struct {
|
|
||||||
struct ggml_context * ctx;
|
|
||||||
} model;
|
|
||||||
|
|
||||||
const int32_t image_size = clip_get_image_size(ctx_clip);
|
|
||||||
const int32_t patch_size = clip_get_patch_size(ctx_clip);
|
|
||||||
|
|
||||||
int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
|
|
||||||
|
|
||||||
int num_patches_width = grid_shape.first; // grid 1-4
|
|
||||||
int num_patches_height = grid_shape.second; // grid 1-4
|
|
||||||
|
|
||||||
const size_t num_images = num_patches_width * num_patches_height + 1;
|
|
||||||
|
|
||||||
// TODO: the required context size is not calculated precisely - it's only tens of MB
|
|
||||||
size_t ctx_size = 0;
|
|
||||||
|
|
||||||
{
|
|
||||||
ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
|
|
||||||
ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ggml_init_params params {
|
|
||||||
/*.mem_size =*/ ctx_size,
|
|
||||||
/*.mem_buffer =*/ NULL,
|
|
||||||
/*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API
|
|
||||||
};
|
|
||||||
|
|
||||||
// Python reference code for full unpad:
|
|
||||||
/*
|
|
||||||
base_image_feature = image_feature[0]
|
|
||||||
image_feature = image_feature[1:]
|
|
||||||
image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
|
|
||||||
image_feature = image_feature.flatten(1, 2).flatten(2, 3)
|
|
||||||
image_feature = unpad_image(image_feature, image_sizes[image_idx])
|
|
||||||
image_feature = torch.cat((
|
|
||||||
image_feature,
|
|
||||||
self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
|
|
||||||
), dim=-1)
|
|
||||||
image_feature = image_feature.flatten(1, 2).transpose(0, 1)
|
|
||||||
image_feature = torch.cat((base_image_feature, image_feature), dim=0)
|
|
||||||
*/
|
|
||||||
// We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval.
|
|
||||||
// In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet.
|
|
||||||
// Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them.
|
|
||||||
// Once all images are processed, the base_image_features are prepended without any changes.
|
|
||||||
|
|
||||||
// Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling))
|
|
||||||
/*
|
|
||||||
image_feature = image_feature.view(2, 2, 24, 24, 4096)
|
|
||||||
image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
|
|
||||||
image_feature = image_feature.view(2, 24, 2, 24, 4096)
|
|
||||||
image_feature = image_feature.flatten(0, 3)
|
|
||||||
|
|
||||||
// Reshape to 4D tensor by merging the last two dimensions
|
|
||||||
image_feature = image_feature.view(2, 2, 24, 24*4096)
|
|
||||||
image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
|
|
||||||
image_feature = image_feature.view(-1, 4096)
|
|
||||||
*/
|
|
||||||
|
|
||||||
model.ctx = ggml_init(params);
|
|
||||||
|
|
||||||
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
|
|
||||||
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
|
|
||||||
// fill it with the image embeddings, ignoring the base
|
|
||||||
for (size_t i = 1; i < num_images; i++) {
|
|
||||||
size_t offset = (i-1) * clip_embd_nbytes(ctx_clip);
|
|
||||||
memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(model.ctx);
|
|
||||||
size_t size_ele = ggml_type_size(GGML_TYPE_F32);
|
|
||||||
|
|
||||||
struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features,
|
|
||||||
num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
|
|
||||||
num_patches_per_side,
|
|
||||||
num_patches_width,
|
|
||||||
num_patches_height,
|
|
||||||
size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
|
|
||||||
size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side,
|
|
||||||
size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0);
|
|
||||||
// ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false);
|
|
||||||
struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
|
|
||||||
/**
|
|
||||||
At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings
|
|
||||||
image_feature = torch.cat((
|
|
||||||
image_feature,
|
|
||||||
self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
|
|
||||||
), dim=-1)
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
// ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false);
|
|
||||||
struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
|
|
||||||
// ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
|
|
||||||
ggml_build_forward_expand(gf, flatten);
|
|
||||||
ggml_graph_compute_with_ctx(model.ctx, gf, 1);
|
|
||||||
struct ggml_tensor* result = ggml_graph_node(gf, -1);
|
|
||||||
|
|
||||||
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
|
|
||||||
// append without newline tokens (default behavior in llava_arch when not using unpad):
|
|
||||||
memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
|
|
||||||
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
|
|
||||||
|
|
||||||
// Debug: Test single segments
|
|
||||||
// Current findings: sending base image, sending a segment embedding all works similar to python
|
|
||||||
// However, permuted embeddings do not work yet (stride issue?)
|
|
||||||
// memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context
|
|
||||||
// memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context
|
|
||||||
// *n_img_pos_out=576;
|
|
||||||
|
|
||||||
ggml_free(model.ctx);
|
|
||||||
return true;
|
|
||||||
}
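// Editor's sketch (not part of the original file): the ggml_view_4d/ggml_permute/ggml_cont sequence
// above is equivalent to the plain loops below, which reorder the sub-image embeddings from
// [grid_h][grid_w][side][side][dim] to [grid_h][side][grid_w][side][dim], i.e. row-major token order
// over the full grid. It assumes the sub-images are stored in row-major grid order; `side` stands for
// num_patches_per_side and `dim` for clip_n_mmproj_embd(). Names and the function itself are hypothetical.
static void rearrange_grid_embd_reference(const float * src, float * dst,
                                          int grid_h, int grid_w, int side, int dim) {
    for (int gy = 0; gy < grid_h; gy++) {
        for (int py = 0; py < side; py++) {
            for (int gx = 0; gx < grid_w; gx++) {
                for (int px = 0; px < side; px++) {
                    const size_t i_src = ((((size_t)gy*grid_w + gx)*side + py)*side + px)*dim;
                    const size_t i_dst = ((((size_t)gy*side   + py)*grid_w + gx)*side + px)*dim;
                    for (int d = 0; d < dim; d++) {
                        dst[i_dst + d] = src[i_src + d];
                    }
                }
            }
        }
    }
}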
|
|
||||||
|
|
||||||
static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) {
|
|
||||||
int width = image->nx;
|
|
||||||
int height = image->ny;
|
|
||||||
int num_patches = (height / patch_size) * (width / patch_size);
|
|
||||||
clip_image_f32 * patch = clip_image_f32_init();
|
|
||||||
patch->nx = patch_size * num_patches;
|
|
||||||
patch->ny = patch_size;
|
|
||||||
patch->buf.resize(3 * patch->nx * patch->ny);
|
|
||||||
|
|
||||||
int patch_index = 0;
|
|
||||||
|
|
||||||
for (int i = 0; i < height; i += patch_size) {
|
|
||||||
for (int j = 0; j < width; j += patch_size) {
|
|
||||||
for (int pi = 0; pi < patch_size; ++pi) {
|
|
||||||
for (int pj = 0; pj < patch_size; ++pj) {
|
|
||||||
int input_index = ((i + pi) * width + (j + pj)) * 3;
|
|
||||||
int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
|
|
||||||
patch->buf[output_index] = image->buf[input_index];
|
|
||||||
patch->buf[output_index+1] = image->buf[input_index+1];
|
|
||||||
patch->buf[output_index+2] = image->buf[input_index+2];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
patch_index++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return patch;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
|
|
||||||
// std::vector<clip_image_f32*> img_res_v; // format N x H x W x RGB (N x 336 x 336 x 3), interleaved RGB - different from the python implementation, which is N x 3 x 336 x 336
|
|
||||||
clip_image_f32_batch_ptr img_res_v(clip_image_f32_batch_init());
|
|
||||||
if (!clip_image_preprocess(ctx_clip, img, img_res_v.get())) {
|
|
||||||
LOG_ERR("%s: unable to preprocess image\n", __func__);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int64_t t_img_enc_start_us = ggml_time_us();
|
|
||||||
|
|
||||||
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
|
|
||||||
|
|
||||||
const size_t n_imgs = clip_image_f32_batch_n_images(img_res_v.get());
|
|
||||||
|
|
||||||
if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
|
|
||||||
std::vector<float *> image_embd_v;
|
|
||||||
image_embd_v.resize(n_imgs);
|
|
||||||
clip_image_size load_image_size;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < n_imgs; i++) {
|
|
||||||
const int64_t t_img_enc_step_start_us = ggml_time_us();
|
|
||||||
int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
|
|
||||||
int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
|
|
||||||
image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, nx, ny));
|
|
||||||
int patch_size = 14;
|
|
||||||
load_image_size.width = nx;
|
|
||||||
load_image_size.height = ny;
|
|
||||||
clip_add_load_image_size(ctx_clip, &load_image_size);
|
|
||||||
|
|
||||||
bool encoded = false;
|
|
||||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
|
|
||||||
if (clip_is_qwen2vl(ctx_clip)) {
|
|
||||||
encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(img_res, patch_size), image_embd_v[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!encoded) {
|
|
||||||
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
const int64_t t_img_enc_step_batch_us = ggml_time_us();
|
|
||||||
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)n_imgs, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
|
|
||||||
}
|
|
||||||
const int64_t t_img_enc_batch_us = ggml_time_us();
|
|
||||||
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
|
||||||
|
|
||||||
int n_img_pos_out = 0;
|
|
||||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
|
||||||
int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
|
|
||||||
int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
|
|
||||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
|
|
||||||
std::memcpy(
|
|
||||||
image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
|
|
||||||
image_embd_v[i],
|
|
||||||
clip_embd_nbytes_by_img(ctx_clip, nx, ny));
|
|
||||||
n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
|
|
||||||
}
|
|
||||||
*n_img_pos = n_img_pos_out;
|
|
||||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
|
||||||
free(image_embd_v[i]);
|
|
||||||
}
|
|
||||||
image_embd_v.clear();
|
|
||||||
load_image_size.width = img->nx;
|
|
||||||
load_image_size.height = img->ny;
|
|
||||||
clip_add_load_image_size(ctx_clip, &load_image_size);
|
|
||||||
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size.width, load_image_size.height);
|
|
||||||
}
|
|
||||||
else if (clip_is_glm(ctx_clip)){
|
|
||||||
struct clip_image_size * load_image_size = clip_image_size_init();
|
|
||||||
load_image_size->width = clip_image_f32_batch_nx(img_res_v.get(), 0);
|
|
||||||
load_image_size->height = clip_image_f32_batch_ny(img_res_v.get(), 0);
|
|
||||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
|
||||||
|
|
||||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
|
|
||||||
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
|
|
||||||
int pos = int(load_image_size->width/clip_get_patch_size(ctx_clip)/2);
|
|
||||||
*n_img_pos = (pos * pos + 2);
|
|
||||||
if (!encoded){
|
|
||||||
LOG_ERR("Unable to encode image \n");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
|
|
||||||
// flat / default llava-1.5 type embedding
|
|
||||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
|
|
||||||
*n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
|
|
||||||
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
|
|
||||||
if (!encoded) {
|
|
||||||
LOG_ERR("Unable to encode image\n");
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// spatial_unpad llava-1.6 type embedding
|
|
||||||
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
|
|
||||||
std::vector<float *> image_embd_v;
|
|
||||||
image_embd_v.resize(n_imgs);
|
|
||||||
for (size_t i = 0; i < n_imgs; i++) {
|
|
||||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
|
|
||||||
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
|
|
||||||
const bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
|
|
||||||
if (!encoded) {
|
|
||||||
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
const int64_t t_img_enc_batch_us = ggml_time_us();
|
|
||||||
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
|
||||||
|
|
||||||
const int32_t * image_grid = clip_image_grid(ctx_clip);
|
|
||||||
const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
|
|
||||||
|
|
||||||
std::vector<std::pair<int, int>> grid_pinpoints;
|
|
||||||
for (size_t i = 0; i < num_gridpoints; i += 2) {
|
|
||||||
grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
|
|
||||||
}
|
|
||||||
|
|
||||||
const int32_t image_size = clip_get_image_size(ctx_clip);
|
|
||||||
|
|
||||||
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
|
|
||||||
|
|
||||||
int n_img_pos_out;
|
|
||||||
clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
|
|
||||||
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
|
|
||||||
*n_img_pos = n_img_pos_out;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
|
||||||
free(image_embd_v[i]);
|
|
||||||
}
|
|
||||||
image_embd_v.clear();
|
|
||||||
|
|
||||||
// debug image/segment/normalization content:
|
|
||||||
// clip_image_u8 * tmp = clip_image_u8_init();
|
|
||||||
// clip_image_convert_f32_to_u8(*image_feature, *tmp);
|
|
||||||
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
|
|
||||||
|
|
||||||
const int64_t t_img_enc_end_us = ggml_time_us();
|
|
||||||
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
|
|
||||||
|
|
||||||
LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
|
|
||||||
// make sure that the correct mmproj was used, i.e., compare apples to apples
|
|
||||||
int n_llama_embd = llama_model_n_embd(llama_get_model(ctx_llama));
|
|
||||||
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
|
|
||||||
if (n_image_embd != n_llama_embd) {
|
|
||||||
LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
|
|
||||||
// Granite vision uses up to 10 patches + base patch
|
|
||||||
int num_max_patches = 11;
|
|
||||||
if (clip_is_minicpmv(ctx_clip)) {
|
|
||||||
num_max_patches = 10;
|
|
||||||
}
|
|
||||||
if (clip_is_glm(ctx_clip)) {
|
|
||||||
num_max_patches = 1;
|
|
||||||
}
|
|
||||||
float * image_embd;
|
|
||||||
if (clip_is_qwen2vl(ctx_clip)) {
|
|
||||||
// qwen2vl does not split the image into chunks, so `num_max_patches` is not needed.
|
|
||||||
image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
|
|
||||||
} else {
|
|
||||||
image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
|
|
||||||
}
|
|
||||||
if (!image_embd) {
|
|
||||||
LOG_ERR("Unable to allocate memory for image embeddings\n");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
int n_img_pos;
|
|
||||||
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
|
|
||||||
LOG_ERR("%s: cannot encode image, aborting\n", __func__);
|
|
||||||
free(image_embd);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
*image_embd_out = image_embd;
|
|
||||||
*n_img_pos_out = n_img_pos;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct llava_embd_batch {
|
|
||||||
std::vector<llama_pos> pos;
|
|
||||||
std::vector<int32_t> n_seq_id;
|
|
||||||
std::vector<llama_seq_id> seq_id_0;
|
|
||||||
std::vector<llama_seq_id *> seq_ids;
|
|
||||||
std::vector<int8_t> logits;
|
|
||||||
llama_batch batch;
|
|
||||||
llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
|
||||||
pos .resize(n_tokens);
|
|
||||||
n_seq_id.resize(n_tokens);
|
|
||||||
seq_ids .resize(n_tokens + 1);
|
|
||||||
logits .resize(n_tokens);
|
|
||||||
seq_id_0.resize(1);
|
|
||||||
seq_id_0[0] = seq_id;
|
|
||||||
seq_ids [n_tokens] = nullptr;
|
|
||||||
batch = {
|
|
||||||
/*n_tokens =*/ n_tokens,
|
|
||||||
/*tokens =*/ nullptr,
|
|
||||||
/*embd =*/ embd,
|
|
||||||
/*pos =*/ pos.data(),
|
|
||||||
/*n_seq_id =*/ n_seq_id.data(),
|
|
||||||
/*seq_id =*/ seq_ids.data(),
|
|
||||||
/*logits =*/ logits.data(),
|
|
||||||
};
|
|
||||||
for (int i = 0; i < n_tokens; i++) {
|
|
||||||
batch.pos [i] = pos_0 + i;
|
|
||||||
batch.n_seq_id[i] = 1;
|
|
||||||
batch.seq_id [i] = seq_id_0.data();
|
|
||||||
batch.logits [i] = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
|
|
||||||
int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
|
|
||||||
|
|
||||||
for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
|
|
||||||
int n_eval = image_embed->n_image_pos - i;
|
|
||||||
if (n_eval > n_batch) {
|
|
||||||
n_eval = n_batch;
|
|
||||||
}
|
|
||||||
float * embd = image_embed->embed+i*n_embd;
|
|
||||||
llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
|
|
||||||
if (llama_decode(ctx_llama, llava_batch.batch)) {
|
|
||||||
LOG_ERR("%s : failed to eval\n", __func__);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
*n_past += n_eval;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
|
|
||||||
clip_image_u8 * img = clip_image_u8_init();
|
|
||||||
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
|
|
||||||
clip_image_u8_free(img);
|
|
||||||
LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
float* image_embed = NULL;
|
|
||||||
int n_image_pos = 0;
|
|
||||||
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
|
|
||||||
if (!image_embed_result) {
|
|
||||||
clip_image_u8_free(img);
|
|
||||||
LOG_ERR("%s: couldn't embed the image\n", __func__);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
clip_image_u8_free(img);
|
|
||||||
auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
|
|
||||||
result->embed = image_embed;
|
|
||||||
result->n_image_pos = n_image_pos;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
|
|
||||||
auto file = fopen(path, "rb");
|
|
||||||
if (file == NULL) {
|
|
||||||
LOG_ERR("%s: can't read file %s\n", __func__, path);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
fseek(file, 0, SEEK_END);
|
|
||||||
auto fileSize = ftell(file);
|
|
||||||
fseek(file, 0, SEEK_SET);
|
|
||||||
|
|
||||||
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
|
|
||||||
if (buffer == NULL) {
|
|
||||||
LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
|
|
||||||
perror("Memory allocation error");
|
|
||||||
fclose(file);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
errno = 0;
|
|
||||||
size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
|
|
||||||
if (ferror(file)) {
|
|
||||||
LOG_ERR("read error: %s", strerror(errno));
|
|
||||||
free(buffer);
|
|
||||||
fclose(file);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (ret != (size_t) fileSize) {
|
|
||||||
LOG_ERR("unexpectedly reached end of file");
|
|
||||||
free(buffer);
|
|
||||||
fclose(file);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
fclose(file); // Close the file
|
|
||||||
|
|
||||||
*bytesOut = buffer;
|
|
||||||
*sizeOut = fileSize;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
|
|
||||||
unsigned char* image_bytes;
|
|
||||||
long image_bytes_length;
|
|
||||||
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
|
|
||||||
if (!loaded) {
|
|
||||||
LOG_ERR("%s: failed to load %s\n", __func__, image_path);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
|
|
||||||
free(image_bytes);
|
|
||||||
|
|
||||||
return embed;
|
|
||||||
}
|
|
||||||
|
|
||||||
void llava_image_embed_free(struct llava_image_embed * embed) {
|
|
||||||
free(embed->embed);
|
|
||||||
free(embed);
|
|
||||||
}
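// Editor's usage sketch (not part of the original file): a minimal call sequence for the llava image
// embed API defined above, assuming `ctx_clip` and `ctx_llama` were created elsewhere (clip and llama
// model loading is omitted) and that a text prompt is evaluated afterwards starting from `n_past`.
static bool llava_eval_single_image_example(clip_ctx * ctx_clip, llama_context * ctx_llama, const char * image_path) {
    if (!llava_validate_embed_size(ctx_llama, ctx_clip)) {
        return false; // mmproj does not match the text model
    }
    llava_image_embed * embed = llava_image_embed_make_with_filename(ctx_clip, /*n_threads=*/4, image_path);
    if (!embed) {
        return false;
    }
    int n_past = 0;
    // writes the image embeddings into the llama context; n_past is advanced past the image tokens
    const bool ok = llava_eval_image_embed(ctx_llama, embed, /*n_batch=*/512, &n_past);
    llava_image_embed_free(embed);
    return ok;
}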
|
|
@ -1,49 +0,0 @@
|
|||||||
#ifndef LLAVA_H
|
|
||||||
#define LLAVA_H
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
|
|
||||||
#ifdef LLAMA_SHARED
|
|
||||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
|
||||||
# ifdef LLAMA_BUILD
|
|
||||||
# define LLAVA_API __declspec(dllexport)
|
|
||||||
# else
|
|
||||||
# define LLAVA_API __declspec(dllimport)
|
|
||||||
# endif
|
|
||||||
# else
|
|
||||||
# define LLAVA_API __attribute__ ((visibility ("default")))
|
|
||||||
# endif
|
|
||||||
#else
|
|
||||||
# define LLAVA_API
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct clip_ctx;
|
|
||||||
struct llava_image_embed {
|
|
||||||
float * embed;
|
|
||||||
int n_image_pos;
|
|
||||||
};
|
|
||||||
|
|
||||||
/** sanity check for clip <-> llava embed size match */
|
|
||||||
LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
|
|
||||||
|
|
||||||
LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
|
|
||||||
|
|
||||||
/** build an image embed from image file bytes */
|
|
||||||
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
|
|
||||||
/** build an image embed from a path to an image filename */
|
|
||||||
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
|
|
||||||
/** free an embedding made with llava_image_embed_make_* */
|
|
||||||
LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
|
|
||||||
|
|
||||||
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
|
|
||||||
LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif
|
|
@ -1,708 +0,0 @@
|
|||||||
#include "clip.h"
|
|
||||||
#include "clip-impl.h"
|
|
||||||
#include "mtmd.h"
|
|
||||||
|
|
||||||
#include "llama.h"
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
#include <cerrno>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <cstring>
|
|
||||||
#include <limits>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
|
|
||||||
// models not having it (llava-1.6) will process embeddings without any special tokens in-between
|
|
||||||
enum mtmd_slice_tmpl {
|
|
||||||
MTMD_SLICE_TMPL_NONE,
|
|
||||||
MTMD_SLICE_TMPL_MINICPMV_2_5,
|
|
||||||
MTMD_SLICE_TMPL_MINICPMV_2_6,
|
|
||||||
// TODO @ngxson : add support for idefics (SmolVLM)
|
|
||||||
};
|
|
||||||
|
|
||||||
struct mtmd_context {
|
|
||||||
struct clip_ctx * ctx_clip;
|
|
||||||
const struct llama_model * text_model;
|
|
||||||
std::vector<float> image_embd_v; // image embedding vector
|
|
||||||
|
|
||||||
bool print_timings;
|
|
||||||
int n_threads;
|
|
||||||
std::string image_marker;
|
|
||||||
|
|
||||||
// for minicpmv, we need special tokens in-between slices
|
|
||||||
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
|
|
||||||
llama_token tok_ov_img_start = LLAMA_TOKEN_NULL; // overview image
|
|
||||||
llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
|
|
||||||
llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices
|
|
||||||
llama_token tok_slices_end = LLAMA_TOKEN_NULL; // end of all slices
|
|
||||||
llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice
|
|
||||||
llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice
|
|
||||||
llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
|
|
||||||
|
|
||||||
bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
|
|
||||||
|
|
||||||
// TODO @ngxson : add timings
|
|
||||||
|
|
||||||
mtmd_context(const char * mmproj_fname,
|
|
||||||
const llama_model * text_model,
|
|
||||||
const mtmd_context_params & ctx_params) :
|
|
||||||
text_model (text_model),
|
|
||||||
print_timings(ctx_params.print_timings),
|
|
||||||
n_threads (ctx_params.n_threads),
|
|
||||||
image_marker (ctx_params.image_marker)
|
|
||||||
{
|
|
||||||
clip_context_params ctx_clip_params;
|
|
||||||
ctx_clip_params.use_gpu = ctx_params.use_gpu;
|
|
||||||
ctx_clip_params.verbosity = ctx_params.verbosity;
|
|
||||||
ctx_clip = clip_init(mmproj_fname, ctx_clip_params);
|
|
||||||
if (!ctx_clip) {
|
|
||||||
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
|
|
||||||
}
|
|
||||||
|
|
||||||
use_mrope = clip_is_qwen2vl(ctx_clip);
|
|
||||||
|
|
||||||
int minicpmv_version = clip_is_minicpmv(ctx_clip);
|
|
||||||
if (minicpmv_version == 2) {
|
|
||||||
// minicpmv 2.5 format:
|
|
||||||
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
|
|
||||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
|
|
||||||
tok_ov_img_start = lookup_token("<image>");
|
|
||||||
tok_ov_img_end = lookup_token("</image>");
|
|
||||||
tok_slices_start = lookup_token("<slice>");
|
|
||||||
tok_slices_end = lookup_token("</slice>");
|
|
||||||
tok_sli_img_start = tok_ov_img_start;
|
|
||||||
tok_sli_img_end = tok_ov_img_end;
|
|
||||||
tok_row_end = lookup_token("\n");
|
|
||||||
|
|
||||||
} else if (minicpmv_version == 3 || minicpmv_version == 4) {
|
|
||||||
// minicpmv 2.6 format:
|
|
||||||
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
|
|
||||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
|
|
||||||
tok_ov_img_start = lookup_token("<image>");
|
|
||||||
tok_ov_img_end = lookup_token("</image>");
|
|
||||||
tok_sli_img_start = lookup_token("<slice>");
|
|
||||||
tok_sli_img_end = lookup_token("</slice>");
|
|
||||||
tok_row_end = lookup_token("\n");
|
|
||||||
|
|
||||||
} else if (minicpmv_version != 0) {
|
|
||||||
GGML_ASSERT(false && "unsupported minicpmv version");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
~mtmd_context() {
|
|
||||||
clip_free(ctx_clip);
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
llama_token lookup_token(const std::string & token_text) {
|
|
||||||
const llama_vocab * vocab = llama_model_get_vocab(text_model);
|
|
||||||
const int n_vocab = llama_vocab_n_tokens(vocab);
|
|
||||||
for (int i = 0; i < n_vocab; i++) {
|
|
||||||
if (token_to_piece(vocab, i, true) == token_text) {
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return LLAMA_TOKEN_NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
|
|
||||||
std::string piece;
|
|
||||||
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\0'
|
|
||||||
const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
|
|
||||||
if (n_chars < 0) {
|
|
||||||
piece.resize(-n_chars);
|
|
||||||
int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
|
|
||||||
GGML_ASSERT(check == -n_chars);
|
|
||||||
} else {
|
|
||||||
piece.resize(n_chars);
|
|
||||||
}
|
|
||||||
return piece;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct mtmd_image_tokens_data {
|
|
||||||
clip_image_f32_batch batch_f32; // preprocessed image patches
|
|
||||||
};
|
|
||||||
|
|
||||||
struct mtmd_image_tokens {
|
|
||||||
uint32_t nx; // number of tokens in x direction
|
|
||||||
uint32_t ny; // number of tokens in y direction
|
|
||||||
bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
|
|
||||||
uint32_t n_tokens() const { return nx * ny; }
|
|
||||||
clip_image_f32_batch batch_f32; // preprocessed image patches
|
|
||||||
std::string id; // optional user-defined ID, useful for KV cache tracking
|
|
||||||
};
|
|
||||||
|
|
||||||
mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
|
|
||||||
const struct llama_model * text_model,
|
|
||||||
const struct mtmd_context_params ctx_params) {
|
|
||||||
try {
|
|
||||||
return new mtmd_context(mmproj_fname, text_model, ctx_params);
|
|
||||||
} catch (const std::exception & e) {
|
|
||||||
LOG_ERR("%s: error: %s\n", __func__, e.what());
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void mtmd_free(mtmd_context * ctx) {
|
|
||||||
if (ctx) {
|
|
||||||
delete ctx;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// copied from common_tokenize
|
|
||||||
static std::vector<llama_token> mtmd_tokenize_text_internal(
|
|
||||||
const struct llama_vocab * vocab,
|
|
||||||
const std::string & text,
|
|
||||||
bool add_special,
|
|
||||||
bool parse_special) {
|
|
||||||
// upper limit for the number of tokens
|
|
||||||
int n_tokens = text.length() + 2 * add_special;
|
|
||||||
std::vector<llama_token> result(n_tokens);
|
|
||||||
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
|
||||||
if (n_tokens < 0) {
|
|
||||||
result.resize(-n_tokens);
|
|
||||||
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
|
||||||
GGML_ASSERT(check == -n_tokens);
|
|
||||||
} else {
|
|
||||||
result.resize(n_tokens);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
||||||
std::vector<mtmd_input_chunk> & output,
|
|
||||||
const mtmd_input_text & text,
|
|
||||||
const std::vector<mtmd_bitmap> & bitmaps) {
|
|
||||||
auto vocab = llama_model_get_vocab(ctx->text_model);
|
|
||||||
|
|
||||||
std::string prompt_modified(text.text);
|
|
||||||
std::string marker_modified(ctx->image_marker);
|
|
||||||
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
|
|
||||||
|
|
||||||
// a bit hacky here, but works for now
|
|
||||||
// for some models, we need to add prefix and suffix to the image embeddings
|
|
||||||
if (clip_is_gemma3(ctx->ctx_clip)) {
|
|
||||||
// gemma 3
|
|
||||||
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
|
||||||
marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
|
|
||||||
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
|
||||||
|
|
||||||
} else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
|
|
||||||
// <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
|
|
||||||
marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>";
|
|
||||||
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
|
||||||
|
|
||||||
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
|
||||||
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
|
||||||
marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
|
|
||||||
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
|
||||||
|
|
||||||
} else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
|
|
||||||
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
|
||||||
marker_modified = ctx->image_marker + "[IMG_END]";
|
|
||||||
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
|
||||||
}
|
|
||||||
|
|
||||||
else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
|
|
||||||
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
|
||||||
marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
|
|
||||||
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
|
|
||||||
|
|
||||||
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
|
|
||||||
output.clear();
|
|
||||||
output.reserve(parts.size());
|
|
||||||
|
|
||||||
size_t i_img = 0;
|
|
||||||
|
|
||||||
// utility for adding raw tokens
|
|
||||||
auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
|
|
||||||
mtmd_input_chunk chunk{
|
|
||||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
|
||||||
std::move(tokens),
|
|
||||||
{},
|
|
||||||
};
|
|
||||||
output.emplace_back(std::move(chunk));
|
|
||||||
};
|
|
||||||
|
|
||||||
// utility for splitting a batch of multiple images into chunks, each holding a single image
|
|
||||||
auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) {
|
|
||||||
std::vector<mtmd_input_chunk> chunks;
|
|
||||||
|
|
||||||
for (auto & entry : batch_f32.entries) {
|
|
||||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
|
||||||
image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
|
|
||||||
image_tokens->ny = 1;
|
|
||||||
image_tokens->batch_f32.entries.push_back(std::move(entry));
|
|
||||||
image_tokens->id = id;
|
|
||||||
|
|
||||||
mtmd_input_chunk chunk{
|
|
||||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
|
||||||
{},
|
|
||||||
std::move(image_tokens),
|
|
||||||
};
|
|
||||||
chunks.emplace_back(std::move(chunk));
|
|
||||||
}
|
|
||||||
|
|
||||||
return chunks;
|
|
||||||
};
|
|
||||||
|
|
||||||
for (const auto & part : parts) {
|
|
||||||
// printf("tokenizing part: %s\n", part.c_str());
|
|
||||||
bool add_bos = &parts.front() == ∂
|
|
||||||
auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
|
|
||||||
if (tokens.empty()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
mtmd_input_chunk chunk{
|
|
||||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
|
||||||
std::move(tokens),
|
|
||||||
{},
|
|
||||||
};
|
|
||||||
output.emplace_back(std::move(chunk));
|
|
||||||
|
|
||||||
if (&parts.back() != &part) {
|
|
||||||
// add image tokens in between the 2 parts
|
|
||||||
|
|
||||||
if (i_img >= bitmaps.size()) {
|
|
||||||
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// convert mtmd_bitmap to clip_image_u8
|
|
||||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
|
||||||
img_u8->nx = bitmaps[i_img].nx;
|
|
||||||
img_u8->ny = bitmaps[i_img].ny;
|
|
||||||
img_u8->buf.resize(bitmaps[i_img].data.size());
|
|
||||||
std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
|
|
||||||
clip_image_size img_u8_size{img_u8->nx, img_u8->ny};
|
|
||||||
|
|
||||||
// preprocess image
|
|
||||||
clip_image_f32_batch batch_f32;
|
|
||||||
bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
|
|
||||||
if (!ok) {
|
|
||||||
LOG_ERR("Unable to preprocess image\n");
|
|
||||||
return 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
|
|
||||||
// split batch into chunks of single images
|
|
||||||
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img].id);
|
|
||||||
GGML_ASSERT(chunks.size() > 0);
|
|
||||||
|
|
||||||
// add overview image
|
|
||||||
add_text_chunk({ctx->tok_ov_img_start});
|
|
||||||
output.emplace_back(std::move(chunks.front()));
|
|
||||||
chunks.erase(chunks.begin());
|
|
||||||
add_text_chunk({ctx->tok_ov_img_end});
|
|
||||||
|
|
||||||
// add slices
|
|
||||||
if (!chunks.empty()) {
|
|
||||||
clip_add_load_image_size(ctx->ctx_clip, &img_u8_size);
|
|
||||||
int n_col = clip_uhd_num_image_embeds_col(ctx->ctx_clip);
|
|
||||||
int n_row = (int)chunks.size() / n_col;
|
|
||||||
GGML_ASSERT(n_row * n_col == (int)chunks.size());
|
|
||||||
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
|
|
||||||
add_text_chunk({ctx->tok_slices_start});
|
|
||||||
}
|
|
||||||
for (int y = 0; y < n_row; y++) {
|
|
||||||
for (int x = 0; x < n_col; x++) {
|
|
||||||
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
|
|
||||||
add_text_chunk({ctx->tok_sli_img_start});
|
|
||||||
}
|
|
||||||
output.emplace_back(std::move(chunks[y * n_col + x]));
|
|
||||||
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
|
|
||||||
add_text_chunk({ctx->tok_sli_img_end});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (ctx->tok_row_end != LLAMA_TOKEN_NULL && y != n_row - 1) {
|
|
||||||
add_text_chunk({ctx->tok_row_end});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
|
|
||||||
add_text_chunk({ctx->tok_slices_end});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} else {
|
|
||||||
size_t n_tokens = 0;
|
|
||||||
for (const auto & entry : batch_f32.entries) {
|
|
||||||
n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
|
|
||||||
}
|
|
||||||
|
|
||||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
|
||||||
if (ctx->use_mrope) {
|
|
||||||
// for Qwen2VL, we need this information for M-RoPE decoding positions
|
|
||||||
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
|
|
||||||
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
|
|
||||||
image_tokens->use_mrope_pos = true;
|
|
||||||
} else {
|
|
||||||
// for other models, we only need the total number of tokens
|
|
||||||
image_tokens->nx = n_tokens;
|
|
||||||
image_tokens->ny = 1;
|
|
||||||
}
|
|
||||||
image_tokens->batch_f32 = std::move(batch_f32);
|
|
||||||
image_tokens->id = bitmaps[i_img].id; // optional
|
|
||||||
|
|
||||||
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
|
|
||||||
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
|
|
||||||
LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
|
|
||||||
|
|
||||||
mtmd_input_chunk chunk{
|
|
||||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
|
||||||
{},
|
|
||||||
std::move(image_tokens),
|
|
||||||
};
|
|
||||||
output.emplace_back(std::move(chunk));
|
|
||||||
}
|
|
||||||
|
|
||||||
i_img++; // move to next image
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
|
|
||||||
if (image_tokens) {
|
|
||||||
delete image_tokens;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
|
|
||||||
return image_tokens->n_tokens();
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
|
|
||||||
return image_tokens->nx;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
|
|
||||||
return image_tokens->ny;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
|
|
||||||
return image_tokens->id;
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
|
|
||||||
if (image_tokens->use_mrope_pos) {
|
|
||||||
return 1; // for M-RoPE, the whole image is 1 in temporal dimension
|
|
||||||
}
|
|
||||||
return image_tokens->n_tokens();
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
|
|
||||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
|
|
||||||
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
|
|
||||||
bool ok = false;
|
|
||||||
|
|
||||||
// only effective for minicpmv and qwen2vl, other models will ignore load_image_size
|
|
||||||
{
|
|
||||||
clip_image_size slice_size{
|
|
||||||
image_tokens->batch_f32.entries[0]->nx,
|
|
||||||
image_tokens->batch_f32.entries[0]->ny};
|
|
||||||
clip_add_load_image_size(ctx->ctx_clip, &slice_size);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
|
|
||||||
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
|
||||||
const auto & entries = image_tokens->batch_f32.entries;
|
|
||||||
for (size_t i = 0; i < entries.size(); i++) {
|
|
||||||
int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get());
|
|
||||||
ok = clip_image_encode(
|
|
||||||
ctx->ctx_clip,
|
|
||||||
ctx->n_threads,
|
|
||||||
entries[i].get(),
|
|
||||||
ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
ok = clip_image_batch_encode(
|
|
||||||
ctx->ctx_clip,
|
|
||||||
ctx->n_threads,
|
|
||||||
&image_tokens->batch_f32,
|
|
||||||
ctx->image_embd_v.data());
|
|
||||||
}
|
|
||||||
|
|
||||||
return ok ? 0 : 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
float * mtmd_get_output_embd(mtmd_context * ctx) {
|
|
||||||
return ctx->image_embd_v.data();
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
|
|
||||||
size_t n_tokens = 0;
|
|
||||||
for (auto & chunk : chunks) {
|
|
||||||
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
|
||||||
n_tokens += chunk.tokens_text.size();
|
|
||||||
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
|
||||||
n_tokens += mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
|
|
||||||
} else {
|
|
||||||
GGML_ASSERT(false && "chunk type not supported");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return n_tokens;
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks) {
|
|
||||||
llama_pos n_pos = 0;
|
|
||||||
for (auto & chunk : chunks) {
|
|
||||||
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
|
||||||
n_pos += chunk.tokens_text.size();
|
|
||||||
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
|
||||||
n_pos += mtmd_image_tokens_get_n_pos(chunk.tokens_image.get());
|
|
||||||
} else {
|
|
||||||
GGML_ASSERT(false && "chunk type not supported");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return n_pos;
|
|
||||||
}
|
|
||||||
|
|
||||||
// helper struct to make working with embd batch easier
|
|
||||||
// note: this will be removed after llama_batch_ext refactoring
|
|
||||||
struct decode_embd_batch {
|
|
||||||
int n_pos_per_embd;
|
|
||||||
int n_mmproj_embd;
|
|
||||||
std::vector<llama_pos> pos;
|
|
||||||
std::vector<llama_pos> pos_view; // used by mrope
|
|
||||||
std::vector<int32_t> n_seq_id;
|
|
||||||
std::vector<llama_seq_id> seq_id_0;
|
|
||||||
std::vector<llama_seq_id *> seq_ids;
|
|
||||||
std::vector<int8_t> logits;
|
|
||||||
llama_batch batch;
|
|
||||||
decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
|
|
||||||
pos .resize(n_tokens * n_pos_per_embd);
|
|
||||||
n_seq_id.resize(n_tokens);
|
|
||||||
seq_ids .resize(n_tokens + 1);
|
|
||||||
logits .resize(n_tokens);
|
|
||||||
seq_id_0.resize(1);
|
|
||||||
seq_ids [n_tokens] = nullptr;
|
|
||||||
batch = {
|
|
||||||
/*n_tokens =*/ n_tokens,
|
|
||||||
/*tokens =*/ nullptr,
|
|
||||||
/*embd =*/ embd,
|
|
||||||
/*pos =*/ pos.data(),
|
|
||||||
/*n_seq_id =*/ n_seq_id.data(),
|
|
||||||
/*seq_id =*/ seq_ids.data(),
|
|
||||||
/*logits =*/ logits.data(),
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
|
|
||||||
seq_id_0[0] = seq_id;
|
|
||||||
for (int i = 0; i < batch.n_tokens; i++) {
|
|
||||||
batch.pos [i] = pos_0 + i;
|
|
||||||
batch.n_seq_id[i] = 1;
|
|
||||||
batch.seq_id [i] = seq_id_0.data();
|
|
||||||
batch.logits [i] = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
|
|
||||||
GGML_ASSERT(n_pos_per_embd == 4);
|
|
||||||
seq_id_0[0] = seq_id;
|
|
||||||
for (int y = 0; y < ny; y++) {
|
|
||||||
for (int x = 0; x < nx; x++) {
|
|
||||||
int i = y * nx + x;
|
|
||||||
pos[i ] = pos_0;
|
|
||||||
pos[i + batch.n_tokens ] = pos_0 + y;
|
|
||||||
pos[i + batch.n_tokens * 2] = pos_0 + x;
|
|
||||||
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (int i = 0; i < batch.n_tokens; i++) {
|
|
||||||
batch.n_seq_id[i] = 1;
|
|
||||||
batch.seq_id [i] = seq_id_0.data();
|
|
||||||
batch.logits [i] = false;
|
|
||||||
}
|
|
||||||
}
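    // Illustrative note (added by the editor): for a 2x3 image (ny = 2, nx = 3) placed at pos_0 = 10,
    // set_position_mrope() fills `pos` as four planes of batch.n_tokens entries each:
    //   temporal: 10 10 10 10 10 10
    //   row (y) : 10 10 10 11 11 11
    //   col (x) : 10 11 12 10 11 12
    //   unused  :  0  0  0  0  0  0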
|
|
||||||
|
|
||||||
llama_batch get_view(int offset, int n_tokens) {
|
|
||||||
llama_pos * pos_ptr;
|
|
||||||
pos_view.clear();
|
|
||||||
pos_view.resize(n_tokens * n_pos_per_embd);
|
|
||||||
if (n_pos_per_embd > 1) {
|
|
||||||
// mrope
|
|
||||||
// for example, with layout of src: 1234...1234...1234...1234...
|
|
||||||
// offset 2 will give us dst: 34...34...34...34...
|
|
||||||
for (int i = 0; i < n_pos_per_embd; i++) {
|
|
||||||
auto src = pos.begin() + i * batch.n_tokens + offset;
|
|
||||||
pos_view.insert(pos_view.end(), src, src + n_tokens);
|
|
||||||
}
|
|
||||||
pos_ptr = pos_view.data();
|
|
||||||
} else {
|
|
||||||
// normal
|
|
||||||
pos_ptr = pos.data() + offset;
|
|
||||||
}
|
|
||||||
return {
|
|
||||||
/*n_tokens =*/ n_tokens,
|
|
||||||
/*tokens =*/ nullptr,
|
|
||||||
/*embd =*/ batch.embd + offset * n_mmproj_embd,
|
|
||||||
/*pos =*/ pos_ptr,
|
|
||||||
/*n_seq_id =*/ batch.n_seq_id + offset,
|
|
||||||
/*seq_id =*/ batch.seq_id + offset,
|
|
||||||
/*logits =*/ batch.logits + offset,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
int32_t mtmd_helper_eval(mtmd_context * ctx,
|
|
||||||
llama_context * lctx,
|
|
||||||
mtmd_input_chunks & chunks,
|
|
||||||
llama_pos pos0,
|
|
||||||
llama_seq_id seq_id,
|
|
||||||
int32_t n_batch) {
|
|
||||||
int32_t ret;
|
|
||||||
llama_pos n_past = pos0;
|
|
||||||
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
|
|
||||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
|
|
||||||
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
|
|
||||||
|
|
||||||
for (auto & chunk : chunks) {
|
|
||||||
bool is_last = &chunk == &chunks.back();
|
|
||||||
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
|
||||||
text_batch.n_tokens = chunk.tokens_text.size();
|
|
||||||
size_t i = 0;
|
|
||||||
while (i < chunk.tokens_text.size()) { // split into batches
|
|
||||||
for (; i < chunk.tokens_text.size() && text_batch.n_tokens < n_batch; i++) {
|
|
||||||
text_batch.token [i] = chunk.tokens_text[i];
|
|
||||||
text_batch.pos [i] = n_past++;
|
|
||||||
text_batch.n_seq_id[i] = 1;
|
|
||||||
text_batch.seq_id [i][0] = seq_id;
|
|
||||||
text_batch.logits [i] = false;
|
|
||||||
}
|
|
||||||
if (is_last) {
|
|
||||||
// always get logits for last input chunk
|
|
||||||
text_batch.logits[text_batch.n_tokens - 1] = true;
|
|
||||||
}
|
|
||||||
ret = llama_decode(lctx, text_batch);
|
|
||||||
if (ret != 0) {
|
|
||||||
LOG_ERR("failed to decode text\n");
|
|
||||||
llama_batch_free(text_batch);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
|
||||||
GGML_ASSERT(!is_last && "logits for last image chunk is not yet supported");
|
|
||||||
GGML_ASSERT(chunk.tokens_image != nullptr);
|
|
||||||
int64_t t0 = ggml_time_ms();
|
|
||||||
if (ctx->print_timings) {
|
|
||||||
LOG_INF("encoding image or slice...\n");
|
|
||||||
}
|
|
||||||
ret = mtmd_encode(ctx, chunk.tokens_image.get());
|
|
||||||
if (ret != 0) {
|
|
||||||
LOG_ERR("failed to encode image\n");
|
|
||||||
llama_batch_free(text_batch);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
if (ctx->print_timings) {
|
|
||||||
LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
|
|
||||||
int32_t i_batch = 0;
|
|
||||||
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
|
|
||||||
float * embd = mtmd_get_output_embd(ctx);
|
|
||||||
decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
|
|
||||||
|
|
||||||
const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
|
|
||||||
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
|
|
||||||
|
|
||||||
if (mtmd_decode_use_mrope(ctx)) {
|
|
||||||
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
|
|
||||||
} else {
|
|
||||||
batch_embd.set_position_normal(n_past, seq_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (mtmd_decode_use_non_causal(ctx)) {
|
|
||||||
llama_set_causal_attn(lctx, false);
|
|
||||||
// TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
|
|
||||||
}
|
|
||||||
|
|
||||||
while (i_batch < n_img_batches) { // split into batches
|
|
||||||
int pos_offset = i_batch*n_batch;
|
|
||||||
int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
|
|
||||||
llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
|
|
||||||
|
|
||||||
LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
|
|
||||||
|
|
||||||
int64_t t1 = ggml_time_ms();
|
|
||||||
ret = llama_decode(lctx, batch_embd_view);
|
|
||||||
if (ret != 0) {
|
|
||||||
LOG_ERR("failed to decode image\n");
|
|
||||||
llama_set_causal_attn(lctx, true); // restore causal attn
|
|
||||||
llama_batch_free(text_batch);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ctx->print_timings) {
|
|
||||||
LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
|
|
||||||
}
|
|
||||||
|
|
||||||
i_batch++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// for mrope, one image is one single **temporal** position
|
|
||||||
n_past += mtmd_decode_use_mrope(ctx) ? 1 : n_tokens;
|
|
||||||
|
|
||||||
if (mtmd_decode_use_non_causal(ctx)) {
|
|
||||||
llama_set_causal_attn(lctx, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
} else {
|
|
||||||
GGML_ASSERT(false && "chunk type not supported");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_batch_free(text_batch);
|
|
||||||
return 0;
|
|
||||||
}
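// Editor's usage sketch (not part of the original file): a minimal end-to-end call sequence for the
// helpers above. It assumes `model` and `lctx` were created with the usual llama.cpp API, that the
// prompt contains exactly one image marker, and it reduces error handling to early returns.
static int32_t mtmd_eval_one_image_example(const llama_model * model, llama_context * lctx,
                                           const char * mmproj_path, const char * image_path) {
    mtmd_context_params params; // defaults: GPU on, 4 threads, "<__image__>" marker
    mtmd_context * ctx = mtmd_init_from_file(mmproj_path, model, params);
    if (!ctx) {
        return 1;
    }
    mtmd_bitmap bitmap;
    if (mtmd_helper_bitmap_init_from_file(image_path, bitmap) != 0) {
        mtmd_free(ctx);
        return 1;
    }
    mtmd_input_text text;
    text.text          = "describe this image in detail: <__image__>";
    text.add_special   = true;
    text.parse_special = true;
    mtmd_input_chunks chunks;
    int32_t ret = mtmd_tokenize(ctx, chunks, text, {bitmap});
    if (ret == 0) {
        // decode the text chunks and encode+decode the image chunk, starting at position 0, sequence 0
        ret = mtmd_helper_eval(ctx, lctx, chunks, /*pos0=*/0, /*seq_id=*/0, /*n_batch=*/512);
    }
    mtmd_free(ctx);
    return ret;
}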
|
|
||||||
|
|
||||||
int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output) {
|
|
||||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
|
||||||
bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
|
|
||||||
if (!ok) {
|
|
||||||
LOG_ERR("Unable to load image from buffer\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
|
|
||||||
output.data.resize(output.nx * output.ny * 3);
|
|
||||||
std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
|
|
||||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
|
||||||
bool ok = clip_image_load_from_file(fname, img_u8.get());
|
|
||||||
if (!ok) {
|
|
||||||
LOG_ERR("Unable to load image %s\n", fname);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
|
|
||||||
output.data.resize(output.nx * output.ny * 3);
|
|
||||||
std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
|
|
||||||
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
|
|
||||||
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
|
|
||||||
return ctx->use_mrope;
|
|
||||||
}
|
|
||||||
|
|
||||||
void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
|
|
||||||
mtmd_image_tokens_free(val);
|
|
||||||
}
|
|
@ -1,168 +0,0 @@
|
|||||||
#ifndef MTMD_H
|
|
||||||
#define MTMD_H
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "llama.h"
|
|
||||||
#include "clip.h"
|
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
#include <cinttypes>
|
|
||||||
#include <memory>
|
|
||||||
|
|
||||||
#ifdef LLAMA_SHARED
|
|
||||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
|
||||||
# ifdef LLAMA_BUILD
|
|
||||||
# define MTMD_API __declspec(dllexport)
|
|
||||||
# else
|
|
||||||
# define MTMD_API __declspec(dllimport)
|
|
||||||
# endif
|
|
||||||
# else
|
|
||||||
# define MTMD_API __attribute__ ((visibility ("default")))
|
|
||||||
# endif
|
|
||||||
#else
|
|
||||||
# define MTMD_API
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
|
|
||||||
enum mtmd_input_chunk_type {
|
|
||||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
|
||||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
|
||||||
};
|
|
||||||
|
|
||||||
struct mtmd_context;
|
|
||||||
struct mtmd_image_tokens;
|
|
||||||
|
|
||||||
// represents raw image data, layout is RGBRGBRGB...
|
|
||||||
// length of data must be nx * ny * 3
|
|
||||||
struct mtmd_bitmap {
|
|
||||||
uint32_t nx;
|
|
||||||
uint32_t ny;
|
|
||||||
std::vector<unsigned char> data;
|
|
||||||
std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
|
|
||||||
};
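// Illustrative note (added by the editor): for a 2x2 image, `data` holds 12 bytes laid out row by row
// with no padding, i.e. R,G,B of pixel (0,0), then (1,0), then (0,1), then (1,1); the
// mtmd_helper_bitmap_init_from_file/from_buf helpers fill this struct from an image file or a memory buffer.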
|
|
||||||
|
|
||||||
struct mtmd_image_tokens_deleter {
|
|
||||||
void operator()(mtmd_image_tokens * val); // forward declaration
|
|
||||||
};
|
|
||||||
using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
|
|
||||||
|
|
||||||
struct mtmd_input_chunk {
|
|
||||||
mtmd_input_chunk_type type;
|
|
||||||
std::vector<llama_token> tokens_text;
|
|
||||||
mtmd_image_tokens_ptr tokens_image;
|
|
||||||
};
|
|
||||||
|
|
||||||
using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
|
|
||||||
|
|
||||||
struct mtmd_context_params {
|
|
||||||
bool use_gpu = true;
|
|
||||||
bool print_timings = true;
|
|
||||||
int n_threads = 4;
|
|
||||||
enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
|
|
||||||
const char * image_marker = "<__image__>";
|
|
||||||
};
|
|
||||||
|
|
||||||
struct mtmd_input_text {
|
|
||||||
std::string text;
|
|
||||||
bool add_special;
|
|
||||||
bool parse_special;
|
|
||||||
};
|
|
||||||
|
|
||||||
// initialize the mtmd context
|
|
||||||
// return nullptr on failure
|
|
||||||
MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
|
|
||||||
const llama_model * text_model,
|
|
||||||
const mtmd_context_params ctx_params);
|
|
||||||
|
|
||||||
MTMD_API void mtmd_free(mtmd_context * ctx);
|
|
||||||
|
|
||||||
// tokenize an input text prompt and an image
|
|
||||||
// the prompt must have the input image marker (default: "<__image__>") in it
|
|
||||||
// the marker will be replaced with the image tokens
|
|
||||||
// for example:
|
|
||||||
// "here is an image: <__image__>\ndescribe it in detail."
|
|
||||||
// this will give 3 chunks:
|
|
||||||
// 1. "here is an image: <start_of_image>"
|
|
||||||
// 2. (image tokens)
|
|
||||||
// 3. "<end_of_image>\ndescribe it in detail."
|
|
||||||
// number of bitmaps must be equal to the number of image markers in the prompt
|
|
||||||
// this function is thread-safe (shared ctx)
|
|
||||||
// return values:
|
|
||||||
// 0 on success
|
|
||||||
// 1 on number of images not matching the number of markers
|
|
||||||
// 2 on image preprocessing error
|
|
||||||
MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
|
|
||||||
std::vector<mtmd_input_chunk> & output,
|
|
||||||
const mtmd_input_text & text,
|
|
||||||
const std::vector<mtmd_bitmap> & bitmaps);
|
|
||||||
|
|
||||||
// access mtmd_image_tokens
|
|
||||||
MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
|
|
||||||
MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
|
|
||||||
MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
|
|
||||||
MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
|
|
||||||
MTMD_API llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens); // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
|
|
||||||
MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
|
|
||||||
|
|
||||||
// returns 0 on success
|
|
||||||
MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
|
|
||||||
const mtmd_image_tokens * image_tokens);
|
|
||||||
|
|
||||||
// get output embeddings from the last encode pass
|
|
||||||
MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
|
|
||||||
|
|
||||||
// whether we need to set non-causal mask before llama_decode
|
|
||||||
MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
|
|
||||||
|
|
||||||
// whether the current model uses M-RoPE for llama_decode
|
|
||||||
MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//
|
|
||||||
// helper functions (can be implemented based on other functions)
|
|
||||||
//
|
|
||||||
|
|
||||||
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
|
|
||||||
MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
|
|
||||||
|
|
||||||
// helper to count the total number of positions from a list of chunks, useful to keep track of n_past
|
|
||||||
MTMD_API llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks);
|
|
||||||
|
|
||||||
// helper function that automatically:
|
|
||||||
// 1. run llama_decode() on text chunks
|
|
||||||
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
|
|
||||||
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
|
|
||||||
// otherwise, returns 0 on success
|
|
||||||
MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
|
|
||||||
llama_context * lctx,
|
|
||||||
mtmd_input_chunks & chunks,
|
|
||||||
llama_pos pos0,
|
|
||||||
llama_seq_id seq_id,
|
|
||||||
int32_t n_batch);
|
|
||||||
|
|
||||||
// helper function to construct a mtmd_bitmap from a file
|
|
||||||
// returns 0 on success
|
|
||||||
// this function is thread-safe
|
|
||||||
MTMD_API int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
|
|
||||||
|
|
||||||
// helper function to construct a mtmd_bitmap from a buffer
|
|
||||||
// the buffer must be an image in a format supported by stb_image (jpg, png, bmp, gif, etc.)
|
|
||||||
// returns 0 on success
|
|
||||||
// this function is thread-safe
|
|
||||||
MTMD_API int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output);
|
|
||||||
|
|
||||||
// convenient unique_ptr wrappers
|
|
||||||
struct mtmd_context_deleter {
|
|
||||||
void operator()(mtmd_context * val) { mtmd_free(val); }
|
|
||||||
};
|
|
||||||
using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
static_assert(false && "C header is not yet supported by this library");
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif
|
|
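Taken together, the declarations in this (now removed) header were meant to be used roughly as follows. This is a minimal sketch, not code from the repository: `model` and `lctx` stand for an already-loaded `llama_model` / `llama_context`, and the file names are placeholders.

```cpp
// minimal usage sketch of the C++ API declared above; error handling mostly omitted
mtmd_context_params mparams;  // defaults: use_gpu = true, n_threads = 4, ...
mtmd_context_ptr mctx(mtmd_init_from_file("mmproj.gguf", model, mparams));

mtmd_bitmap bmp;
if (mtmd_helper_bitmap_init_from_file("image.jpg", bmp) != 0) {
    // failed to load or decode the image
}

mtmd_input_text text;
text.text          = "here is an image: <__image__>\ndescribe it in detail.";
text.add_special   = true;
text.parse_special = true;

mtmd_input_chunks chunks;
if (mtmd_tokenize(mctx.get(), chunks, text, { bmp }) == 0) {
    // decodes text chunks and encodes + decodes image chunks in order
    mtmd_helper_eval(mctx.get(), lctx, chunks, /*pos0=*/0, /*seq_id=*/0, /*n_batch=*/512);
}
```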
@ -1,217 +0,0 @@
|
|||||||
import argparse
|
|
||||||
from typing import Dict, List, Optional
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import numpy as np
|
|
||||||
from gguf import *
|
|
||||||
from transformers import (
|
|
||||||
AutoProcessor,
|
|
||||||
Qwen2VLConfig,
|
|
||||||
Qwen2VLProcessor,
|
|
||||||
Qwen2VLForConditionalGeneration,
|
|
||||||
Qwen2_5_VLConfig, # type: ignore[reportAttributeAccessIssue]
|
|
||||||
Qwen2_5_VLForConditionalGeneration, # type: ignore[reportAttributeAccessIssue]
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
VISION = "clip.vision"
|
|
||||||
|
|
||||||
|
|
||||||
def k(raw_key: str, arch: str) -> str:
|
|
||||||
return raw_key.format(arch=arch)
|
|
||||||
|
|
||||||
|
|
||||||
def get_n_wa_pattern(fullatt_block_indexes: Optional[List[int]]):
|
|
||||||
if fullatt_block_indexes is None:
|
|
||||||
return 0
|
|
||||||
n_wa = fullatt_block_indexes[0]
|
|
||||||
for a, b in zip(fullatt_block_indexes, fullatt_block_indexes[1:]):
|
|
||||||
if b - a - 1 != n_wa:
|
|
||||||
raise ValueError(
|
|
||||||
f"window/full attention layer should have fix pattern of "
|
|
||||||
f"for each full-attention layer followed by {n_wa} window-attention layers"
|
|
||||||
)
|
|
||||||
return n_wa + 1
|
|
||||||
|
|
||||||
|
|
||||||
class VL2:
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def to_gguf_name(name: str) -> str:
|
|
||||||
og = name
|
|
||||||
name = name.replace("text_model", "t").replace("vision_model", "v")
|
|
||||||
name = name.replace("blocks", "blk").replace("embeddings.", "")
|
|
||||||
name = name.replace("attn.", "attn_")
|
|
||||||
name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
|
|
||||||
# name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
|
|
||||||
name = name.replace("norm1", "ln1").replace("norm2", "ln2")
|
|
||||||
name = name.replace("merger.mlp", 'mm')
|
|
||||||
print(f"[to_gguf_name] {og} --> {name}")
|
|
||||||
return name
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def find_vision_tensors(cls, qwen2vl, dtype) -> Dict[str, np.ndarray]:
|
|
||||||
vision_model = qwen2vl.visual
|
|
||||||
tensor_map = {}
|
|
||||||
for name, ten in vision_model.state_dict().items():
|
|
||||||
ten = ten.numpy()
|
|
||||||
if 'qkv' in name:
|
|
||||||
if ten.ndim == 2: # weight
|
|
||||||
c3, _ = ten.shape
|
|
||||||
else: # bias
|
|
||||||
c3 = ten.shape[0]
|
|
||||||
assert c3 % 3 == 0
|
|
||||||
c = c3 // 3
|
|
||||||
wq = ten[:c]
|
|
||||||
wk = ten[c: c * 2]
|
|
||||||
wv = ten[c * 2:]
|
|
||||||
tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
|
|
||||||
tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
|
|
||||||
tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
|
|
||||||
elif 'merger' in name:
|
|
||||||
if name.endswith("ln_q.weight"):
|
|
||||||
tensor_map['v.post_ln.weight'] = ten
|
|
||||||
elif name.endswith("ln_q.bias"):
|
|
||||||
tensor_map['v.post_ln.bias'] = ten
|
|
||||||
else:
|
|
||||||
# "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
|
|
||||||
tensor_map[cls.to_gguf_name(name)] = ten
|
|
||||||
elif 'patch_embed.proj.weight' in name:
|
|
||||||
# NOTE: split Conv3D into Conv2Ds
|
|
||||||
c1, c2, kt, kh, kw = ten.shape
|
|
||||||
assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
|
|
||||||
tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
|
|
||||||
tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
|
|
||||||
else:
|
|
||||||
tensor_map[cls.to_gguf_name(f"vision_model.{name}")] = ten
|
|
||||||
|
|
||||||
for new_name, ten in tensor_map.items():
|
|
||||||
if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
|
|
||||||
tensor_map[new_name] = ten.astype(np.float32)
|
|
||||||
else:
|
|
||||||
tensor_map[new_name] = ten.astype(dtype)
|
|
||||||
tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder
|
|
||||||
return tensor_map
|
|
||||||
|
|
||||||
|
|
||||||
class VL25(VL2):
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def to_gguf_name(name: str) -> str:
|
|
||||||
og = name
|
|
||||||
name = name.replace("text_model", "t").replace("vision_model", "v")
|
|
||||||
name = name.replace("blocks", "blk").replace("embeddings.", "")
|
|
||||||
name = name.replace("attn.", "attn_")
|
|
||||||
name = name.replace("mlp.down_proj", "ffn_down").replace("mlp.up_proj", "ffn_up")
|
|
||||||
name = name.replace("mlp.gate_proj", "ffn_gate").replace("proj.", "out.")
|
|
||||||
name = name.replace("norm1", "ln1").replace("norm2", "ln2")
|
|
||||||
name = name.replace("merger.mlp", 'mm')
|
|
||||||
print(f"[vl25][to_gguf_name] {og} --> {name}")
|
|
||||||
return name
|
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
|
||||||
if args.data_type == 'fp32':
|
|
||||||
dtype = torch.float32
|
|
||||||
np_dtype = np.float32
|
|
||||||
ftype = 0
|
|
||||||
elif args.data_type == 'fp16':
|
|
||||||
dtype = torch.float16
|
|
||||||
np_dtype = np.float16
|
|
||||||
ftype = 1
|
|
||||||
else:
|
|
||||||
raise ValueError()
|
|
||||||
|
|
||||||
local_model = False
|
|
||||||
model_path = ""
|
|
||||||
model_name = args.model_name
|
|
||||||
print("model_name: ", model_name)
|
|
||||||
if args.model_type == "qwen2vl":
|
|
||||||
qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
|
|
||||||
model_name, torch_dtype=dtype, device_map="cpu"
|
|
||||||
)
|
|
||||||
cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
|
|
||||||
vcfg = cfg.vision_config
|
|
||||||
else:
|
|
||||||
qwen2vl = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
|
||||||
model_name, torch_dtype=dtype, device_map="cpu"
|
|
||||||
)
|
|
||||||
cfg: Qwen2_5_VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
|
|
||||||
vcfg = cfg.vision_config
|
|
||||||
|
|
||||||
if os.path.isdir(model_name):
|
|
||||||
local_model = True
|
|
||||||
if model_name.endswith(os.sep):
|
|
||||||
model_name = model_name[:-1]
|
|
||||||
model_path = model_name
|
|
||||||
model_name = os.path.basename(model_name)
|
|
||||||
fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"
|
|
||||||
|
|
||||||
fout = GGUFWriter(path=fname_out, arch="clip")
|
|
||||||
fout.add_description("image encoder for Qwen2VL")
|
|
||||||
|
|
||||||
fout.add_file_type(ftype)
|
|
||||||
fout.add_bool("clip.has_text_encoder", False)
|
|
||||||
fout.add_bool("clip.has_vision_encoder", True)
|
|
||||||
fout.add_bool("clip.has_qwen2vl_merger", True)
|
|
||||||
|
|
||||||
print(cfg.vision_config)
|
|
||||||
if 'silu' in cfg.vision_config.hidden_act.lower():
|
|
||||||
fout.add_bool("clip.use_silu", True)
|
|
||||||
fout.add_bool("clip.use_gelu", False)
|
|
||||||
elif 'gelu' in cfg.vision_config.hidden_act.lower():
|
|
||||||
fout.add_bool("clip.use_silu", False)
|
|
||||||
fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower())
|
|
||||||
else:
|
|
||||||
raise ValueError()
|
|
||||||
|
|
||||||
if args.model_type == "qwen2.5vl":
|
|
||||||
fout.add_uint32("clip.vision.n_wa_pattern", get_n_wa_pattern(vcfg.fullatt_block_indexes))
|
|
||||||
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.hidden_size)
|
|
||||||
fout.add_uint32("clip.vision.projection_dim", vcfg.out_hidden_size)
|
|
||||||
fout.add_string("clip.projector_type", "qwen2.5vl_merger")
|
|
||||||
else:
|
|
||||||
fout.add_string("clip.projector_type", "qwen2vl_merger")
|
|
||||||
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
|
|
||||||
fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
|
|
||||||
|
|
||||||
if args.model_type == "qwen2.5vl":
|
|
||||||
tensor_map = VL25.find_vision_tensors(qwen2vl, np_dtype)
|
|
||||||
else:
|
|
||||||
tensor_map = VL2.find_vision_tensors(qwen2vl, np_dtype)
|
|
||||||
for name, data in tensor_map.items():
|
|
||||||
fout.add_tensor(name, data)
|
|
||||||
|
|
||||||
fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
|
|
||||||
fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2)
|
|
||||||
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
|
|
||||||
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
|
|
||||||
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
|
|
||||||
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0) # not sure what this does, put 0 here as a placeholder
|
|
||||||
fout.add_name(model_name)
|
|
||||||
"""
|
|
||||||
HACK: Since the vision RoPE-related parameters aren't stored in the `Qwen2VLConfig`,
|
|
||||||
they are hardcoded in `clip_image_build_graph` in `clip.cpp`.
|
|
||||||
"""
|
|
||||||
|
|
||||||
if local_model:
|
|
||||||
processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path)
|
|
||||||
else:
|
|
||||||
processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
|
|
||||||
fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue]
|
|
||||||
fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue]
|
|
||||||
|
|
||||||
fout.write_header_to_file()
|
|
||||||
fout.write_kv_data_to_file()
|
|
||||||
fout.write_tensors_to_file()
|
|
||||||
fout.close()
|
|
||||||
print("save model as: ", fname_out)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
|
|
||||||
parser.add_argument("--model_type", nargs='?', choices=['qwen2vl', 'qwen2.5vl'], default="qwen2vl")
|
|
||||||
parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
|
|
||||||
args = parser.parse_args()
|
|
||||||
main(args)
|
|
@ -1,636 +0,0 @@
|
|||||||
#include "arg.h"
|
|
||||||
#include "base64.hpp"
|
|
||||||
#include "log.h"
|
|
||||||
#include "common.h"
|
|
||||||
#include "sampling.h"
|
|
||||||
#include "clip.h"
|
|
||||||
#include "llava.h"
|
|
||||||
#include "llama.h"
|
|
||||||
#include "ggml.h"
|
|
||||||
|
|
||||||
#ifdef GGML_USE_CUDA
|
|
||||||
#include "ggml-cuda.h"
|
|
||||||
#endif
|
|
||||||
#ifdef NDEBUG
|
|
||||||
#include "ggml-alloc.h"
|
|
||||||
#include "ggml-backend.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <cstring>
|
|
||||||
#include <vector>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <iostream>
|
|
||||||
#include <fstream>
|
|
||||||
#include <limits>
|
|
||||||
#include <cassert>
|
|
||||||
#include <cmath>
|
|
||||||
|
|
||||||
// THIS FILE IS ONLY USED FOR TESTING THE QWEN2VL MODEL
|
|
||||||
// IT IS NOT PRODUCTION CODE
|
|
||||||
|
|
||||||
static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
|
|
||||||
int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
|
|
||||||
int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
|
|
||||||
const int patch_size = 14 * 2;
|
|
||||||
const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
|
|
||||||
const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
|
|
||||||
auto img_tokens = image_embed->n_image_pos;
|
|
||||||
// llama_pos mrope_pos[img_tokens * 4];
|
|
||||||
std::vector<llama_pos> mrope_pos;
|
|
||||||
mrope_pos.resize(img_tokens * 4);
|
|
||||||
|
|
||||||
for (int y = 0; y < ph; y++)
|
|
||||||
{
|
|
||||||
for (int x = 0; x < pw; x++)
|
|
||||||
{
|
|
||||||
int i = y * pw + x;
|
|
||||||
mrope_pos[i] = *st_pos_id;
|
|
||||||
mrope_pos[i + img_tokens] = *st_pos_id + y;
|
|
||||||
mrope_pos[i + img_tokens * 2] = *st_pos_id + x;
|
|
||||||
mrope_pos[i + img_tokens * 3] = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*st_pos_id += std::max(pw, ph);
|
|
||||||
|
|
||||||
int processed = 0;
|
|
||||||
std::vector<llama_pos> batch_mrope_pos;
|
|
||||||
batch_mrope_pos.resize(img_tokens * 4);
|
|
||||||
|
|
||||||
for (int i = 0; i < img_tokens; i += n_batch) {
|
|
||||||
int n_eval = img_tokens - i;
|
|
||||||
if (n_eval > n_batch) {
|
|
||||||
n_eval = n_batch;
|
|
||||||
}
|
|
||||||
|
|
||||||
// llama_pos batch_mrope_pos[n_eval * 4];
|
|
||||||
std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0);
|
|
||||||
memcpy(batch_mrope_pos.data(), &mrope_pos[processed], n_eval * sizeof(llama_pos));
|
|
||||||
memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos));
|
|
||||||
memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos));
|
|
||||||
memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos));
|
|
||||||
|
|
||||||
llama_batch batch = {
|
|
||||||
int32_t(n_eval), // n_tokens
|
|
||||||
nullptr, // token
|
|
||||||
(image_embed->embed+i*n_embd), // embed
|
|
||||||
batch_mrope_pos.data(), // pos
|
|
||||||
nullptr, // n_seq_id
|
|
||||||
nullptr, // seq_id
|
|
||||||
nullptr, // logits
|
|
||||||
};
|
|
||||||
|
|
||||||
if (llama_decode(ctx_llama, batch)) {
|
|
||||||
LOG_ERR("%s : failed to eval\n", __func__);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
*n_past += n_eval;
|
|
||||||
processed += n_eval;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
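To make the position layout built above concrete, here is a worked example with assumed values (not taken from the code): a 2×3 patch grid (`ph = 2`, `pw = 3`, so `img_tokens = 6`) starting at `*st_pos_id = 5`.

```cpp
// Worked example of the M-RoPE position buffer filled above (assumed: ph = 2, pw = 3,
// *st_pos_id = 5, img_tokens = 6). The four channels are stored back to back:
//   patch index i     : 0 1 2 3 4 5
//   mrope_pos[i]      : 5 5 5 5 5 5   // temporal position (constant)
//   mrope_pos[i + 6]  : 5 5 5 6 6 6   // *st_pos_id + y
//   mrope_pos[i + 12] : 5 6 7 5 6 7   // *st_pos_id + x
//   mrope_pos[i + 18] : 0 0 0 0 0 0   // unused fourth channel
// afterwards *st_pos_id advances by max(pw, ph) = 3
```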
|
|
||||||
|
|
||||||
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
|
|
||||||
int N = (int) tokens.size();
|
|
||||||
for (int i = 0; i < N; i += n_batch) {
|
|
||||||
int n_eval = (int) tokens.size() - i;
|
|
||||||
if (n_eval > n_batch) {
|
|
||||||
n_eval = n_batch;
|
|
||||||
}
|
|
||||||
auto batch = llama_batch_get_one(&tokens[i], n_eval);
|
|
||||||
|
|
||||||
if (llama_decode(ctx_llama, batch)) {
|
|
||||||
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
*n_past += n_eval;
|
|
||||||
*st_pos_id += n_eval;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past, int * st_pos_id) {
|
|
||||||
std::vector<llama_token> tokens;
|
|
||||||
tokens.push_back(id);
|
|
||||||
return eval_tokens(ctx_llama, tokens, 1, n_past, st_pos_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, int * st_pos_id, bool add_bos){
|
|
||||||
std::string str2 = str;
|
|
||||||
std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
|
|
||||||
eval_tokens(ctx_llama, embd_inp, n_batch, n_past, st_pos_id);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
static const char * sample(struct common_sampler * smpl,
|
|
||||||
struct llama_context * ctx_llama,
|
|
||||||
int * n_past, int * st_pos_id) {
|
|
||||||
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
|
|
||||||
common_sampler_accept(smpl, id, true);
|
|
||||||
|
|
||||||
const llama_model * model = llama_get_model(ctx_llama);
|
|
||||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
|
||||||
|
|
||||||
static std::string ret;
|
|
||||||
if (llama_vocab_is_eog(vocab, id)) {
|
|
||||||
ret = "</s>";
|
|
||||||
} else {
|
|
||||||
ret = common_token_to_piece(ctx_llama, id);
|
|
||||||
}
|
|
||||||
eval_id(ctx_llama, id, n_past, st_pos_id);
|
|
||||||
return ret.c_str();
|
|
||||||
}
|
|
||||||
|
|
||||||
static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
|
|
||||||
static const char* IMG_BASE64_TAG_END = "\">";
|
|
||||||
|
|
||||||
static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
|
|
||||||
begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
|
|
||||||
end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool prompt_contains_image(const std::string& prompt) {
|
|
||||||
size_t begin, end;
|
|
||||||
find_image_tag_in_prompt(prompt, begin, end);
|
|
||||||
return (begin != std::string::npos);
|
|
||||||
}
|
|
||||||
|
|
||||||
// extracts the base64-encoded image from the prompt and builds the image embedding
|
|
||||||
static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
|
|
||||||
size_t img_base64_str_start, img_base64_str_end;
|
|
||||||
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
|
|
||||||
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
|
|
||||||
LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
|
|
||||||
auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
|
|
||||||
auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
|
|
||||||
|
|
||||||
auto required_bytes = base64::required_encode_size(base64_str.size());
|
|
||||||
auto img_bytes = std::vector<unsigned char>(required_bytes);
|
|
||||||
base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
|
|
||||||
|
|
||||||
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
|
|
||||||
if (!embed) {
|
|
||||||
LOG_ERR("%s: could not load image from base64 string.\n", __func__);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
return embed;
|
|
||||||
}
|
|
||||||
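For reference, a prompt carrying an inline image in the form parsed above looks like the following; the base64 payload is an illustrative stand-in, not real image data.

```cpp
// hypothetical prompt matching IMG_BASE64_TAG_BEGIN / IMG_BASE64_TAG_END above
// (payload truncated; any stb_image-supported format encoded as base64 works)
std::string prompt =
    "describe this: <img src=\"data:image/jpeg;base64,/9j/4AAQSkZJRg...\">";
```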
|
|
||||||
static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
|
|
||||||
size_t begin, end;
|
|
||||||
find_image_tag_in_prompt(prompt, begin, end);
|
|
||||||
if (begin == std::string::npos || end == std::string::npos) {
|
|
||||||
return prompt;
|
|
||||||
}
|
|
||||||
auto pre = prompt.substr(0, begin);
|
|
||||||
auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
|
|
||||||
return pre + replacement + post;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct llava_context {
|
|
||||||
struct clip_ctx * ctx_clip = NULL;
|
|
||||||
struct llama_context * ctx_llama = NULL;
|
|
||||||
struct llama_model * model = NULL;
|
|
||||||
};
|
|
||||||
|
|
||||||
static void print_usage(int, char ** argv) {
|
|
||||||
LOG("\n example usage:\n");
|
|
||||||
LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
|
||||||
LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
|
|
||||||
|
|
||||||
// load and preprocess the image
|
|
||||||
llava_image_embed * embed = NULL;
|
|
||||||
auto prompt = params->prompt;
|
|
||||||
if (prompt_contains_image(prompt)) {
|
|
||||||
if (!params->image.empty()) {
|
|
||||||
LOG_INF("using base64 encoded image instead of command line image path\n");
|
|
||||||
}
|
|
||||||
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
|
|
||||||
if (!embed) {
|
|
||||||
LOG_ERR("%s: can't load image from prompt\n", __func__);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
params->prompt = remove_image_from_prompt(prompt);
|
|
||||||
} else {
|
|
||||||
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
|
||||||
if (!embed) {
|
|
||||||
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return embed;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
|
|
||||||
int n_past = 0;
|
|
||||||
int cur_pos_id = 0;
|
|
||||||
|
|
||||||
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
|
|
||||||
|
|
||||||
std::string system_prompt, user_prompt;
|
|
||||||
size_t image_pos = prompt.find("<|vision_start|>");
|
|
||||||
if (image_pos != std::string::npos) {
|
|
||||||
// new templating mode: provide the full prompt including the system message and use <|vision_start|> as the placeholder for the image
|
|
||||||
system_prompt = prompt.substr(0, image_pos);
|
|
||||||
user_prompt = prompt.substr(image_pos + std::string("<|vision_pad|>").length());
|
|
||||||
LOG_INF("system_prompt: %s\n", system_prompt.c_str());
|
|
||||||
if (params->verbose_prompt) {
|
|
||||||
auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
|
|
||||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
|
||||||
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
LOG_INF("user_prompt: %s\n", user_prompt.c_str());
|
|
||||||
if (params->verbose_prompt) {
|
|
||||||
auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
|
||||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
|
||||||
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// default mode: use the built-in Qwen2-VL chat template
|
|
||||||
system_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|>";
|
|
||||||
user_prompt = "<|vision_end|>" + prompt + "<|im_end|>\n<|im_start|>assistant\n";
|
|
||||||
if (params->verbose_prompt) {
|
|
||||||
auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
|
||||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
|
||||||
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, true);
|
|
||||||
if (image_embed != nullptr) {
|
|
||||||
auto image_size = clip_get_load_image_size(ctx_llava->ctx_clip);
|
|
||||||
qwen2vl_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past, &cur_pos_id, image_size);
|
|
||||||
}
|
|
||||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, false);
|
|
||||||
|
|
||||||
// generate the response
|
|
||||||
|
|
||||||
LOG("\n");
|
|
||||||
|
|
||||||
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
|
|
||||||
if (!smpl) {
|
|
||||||
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string response = "";
|
|
||||||
for (int i = 0; i < max_tgt_len; i++) {
|
|
||||||
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past, &cur_pos_id);
|
|
||||||
response += tmp;
|
|
||||||
if (strcmp(tmp, "</s>") == 0) break;
|
|
||||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
|
||||||
LOG("%s", tmp);
|
|
||||||
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
|
|
||||||
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
|
|
||||||
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
|
|
||||||
|
|
||||||
fflush(stdout);
|
|
||||||
}
|
|
||||||
|
|
||||||
common_sampler_free(smpl);
|
|
||||||
LOG("\n");
|
|
||||||
}
|
|
||||||
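For the default (non-templated) branch above, the model therefore sees the pieces in this order; the sketch below simply restates the string literals from the code, with the image embedding evaluated in between.

```cpp
// order of evaluation in process_prompt() for the default template:
//   1. system part : "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|>"
//   2. image       : qwen2vl_eval_image_embed() with the M-RoPE positions shown earlier
//   3. user part   : "<|vision_end|>" + prompt + "<|im_end|>\n<|im_start|>assistant\n"
```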
|
|
||||||
static struct llama_model * llava_init(common_params * params) {
|
|
||||||
llama_backend_init();
|
|
||||||
llama_numa_init(params->numa);
|
|
||||||
|
|
||||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
|
||||||
|
|
||||||
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
|
|
||||||
if (model == NULL) {
|
|
||||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
return model;
|
|
||||||
}
|
|
||||||
|
|
||||||
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
|
|
||||||
const char * clip_path = params->mmproj.path.c_str();
|
|
||||||
|
|
||||||
auto prompt = params->prompt;
|
|
||||||
if (prompt.empty()) {
|
|
||||||
prompt = "describe the image in detail.";
|
|
||||||
}
|
|
||||||
|
|
||||||
auto ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
|
|
||||||
|
|
||||||
llama_context_params ctx_params = common_context_params_to_llama(*params);
|
|
||||||
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
|
|
||||||
|
|
||||||
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
|
|
||||||
|
|
||||||
if (ctx_llama == NULL) {
|
|
||||||
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
|
||||||
|
|
||||||
ctx_llava->ctx_llama = ctx_llama;
|
|
||||||
ctx_llava->ctx_clip = ctx_clip;
|
|
||||||
ctx_llava->model = model;
|
|
||||||
return ctx_llava;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void llava_free(struct llava_context * ctx_llava) {
|
|
||||||
if (ctx_llava->ctx_clip) {
|
|
||||||
clip_free(ctx_llava->ctx_clip);
|
|
||||||
ctx_llava->ctx_clip = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_free(ctx_llava->ctx_llama);
|
|
||||||
llama_model_free(ctx_llava->model);
|
|
||||||
llama_backend_free();
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifndef NDEBUG
|
|
||||||
|
|
||||||
static void debug_test_mrope_2d() {
|
|
||||||
// 1. Initialize backend
|
|
||||||
ggml_backend_t backend = NULL;
|
|
||||||
std::string backend_name = "";
|
|
||||||
// #ifdef GGML_USE_CUDA
|
|
||||||
// fprintf(stderr, "%s: using CUDA backend\n", __func__);
|
|
||||||
// backend = ggml_backend_cuda_init(0); // init device 0
|
|
||||||
// backend_name = "cuda";
|
|
||||||
// if (!backend) {
|
|
||||||
// fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
|
|
||||||
// }
|
|
||||||
// #endif
|
|
||||||
// if there is no GPU backend, fall back to the CPU backend
|
|
||||||
if (!backend) {
|
|
||||||
backend = ggml_backend_cpu_init();
|
|
||||||
backend_name = "cpu";
|
|
||||||
}
|
|
||||||
|
|
||||||
// Calculate the size needed to allocate
|
|
||||||
size_t ctx_size = 0;
|
|
||||||
ctx_size += 2 * ggml_tensor_overhead(); // tensors
|
|
||||||
// no need to allocate anything else!
|
|
||||||
|
|
||||||
// 2. Allocate `ggml_context` to store tensor data
|
|
||||||
struct ggml_init_params params = {
|
|
||||||
/*.mem_size =*/ ctx_size,
|
|
||||||
/*.mem_buffer =*/ NULL,
|
|
||||||
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
|
|
||||||
};
|
|
||||||
struct ggml_context * ctx = ggml_init(params);
|
|
||||||
|
|
||||||
struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 12, 30);
|
|
||||||
ggml_set_name(inp_raw, "inp_raw");
|
|
||||||
ggml_set_input(inp_raw);
|
|
||||||
|
|
||||||
struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 30 * 4);
|
|
||||||
ggml_set_name(pos, "pos");
|
|
||||||
ggml_set_input(pos);
|
|
||||||
|
|
||||||
std::vector<float> dummy_q;
|
|
||||||
dummy_q.resize(128 * 12 * 30);
|
|
||||||
std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
|
|
||||||
// memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
|
|
||||||
|
|
||||||
std::vector<int> pos_id;
|
|
||||||
pos_id.resize(30 * 4);
|
|
||||||
for (int i = 0; i < 30; i ++) {
|
|
||||||
pos_id[i] = i;
|
|
||||||
pos_id[i + 30] = i + 10;
|
|
||||||
pos_id[i + 60] = i + 20;
|
|
||||||
pos_id[i + 90] = i + 30;
|
|
||||||
}
|
|
||||||
int sections[4] = {32, 32, 0, 0};
|
|
||||||
|
|
||||||
// 4. Allocate a `ggml_backend_buffer` to store all tensors
|
|
||||||
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
|
|
||||||
|
|
||||||
// 5. Copy tensor data from main memory (RAM) to backend buffer
|
|
||||||
ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
|
|
||||||
ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos));
|
|
||||||
|
|
||||||
// 6. Create a `ggml_cgraph` for mul_mat operation
|
|
||||||
struct ggml_cgraph * gf = NULL;
|
|
||||||
struct ggml_context * ctx_cgraph = NULL;
|
|
||||||
|
|
||||||
// create a temporary context to build the graph
|
|
||||||
struct ggml_init_params params0 = {
|
|
||||||
/*.mem_size =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
|
|
||||||
/*.mem_buffer =*/ NULL,
|
|
||||||
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
|
|
||||||
};
|
|
||||||
ctx_cgraph = ggml_init(params0);
|
|
||||||
gf = ggml_new_graph(ctx_cgraph);
|
|
||||||
|
|
||||||
struct ggml_tensor * result0 = ggml_rope_multi(
|
|
||||||
ctx_cgraph, inp_raw, pos, nullptr,
|
|
||||||
128/2, sections, LLAMA_ROPE_TYPE_VISION, 32768, 1000000, 1,
|
|
||||||
0, 1, 32, 1);
|
|
||||||
|
|
||||||
// Add "result" tensor and all of its dependencies to the cgraph
|
|
||||||
ggml_build_forward_expand(gf, result0);
|
|
||||||
|
|
||||||
// 7. Create a `ggml_gallocr` for cgraph computation
|
|
||||||
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
|
|
||||||
ggml_gallocr_alloc_graph(allocr, gf);
|
|
||||||
|
|
||||||
// 9. Run the computation
|
|
||||||
int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
|
|
||||||
if (ggml_backend_is_cpu(backend)) {
|
|
||||||
ggml_backend_cpu_set_n_threads(backend, n_threads);
|
|
||||||
}
|
|
||||||
ggml_backend_graph_compute(backend, gf);
|
|
||||||
|
|
||||||
// 10. Retrieve results (output tensors)
|
|
||||||
// in this example, output tensor is always the last tensor in the graph
|
|
||||||
struct ggml_tensor * result = result0;
|
|
||||||
// struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
|
|
||||||
float * result_data = (float *)malloc(ggml_nbytes(result));
|
|
||||||
// because the tensor data is stored in device buffer, we need to copy it back to RAM
|
|
||||||
ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
|
|
||||||
const std::string bin_file = "mrope_2d_" + backend_name +".bin";
|
|
||||||
std::ofstream outFile(bin_file, std::ios::binary);
|
|
||||||
|
|
||||||
if (outFile.is_open()) {
|
|
||||||
outFile.write(reinterpret_cast<const char*>(result_data), ggml_nbytes(result));
|
|
||||||
outFile.close();
|
|
||||||
std::cout << "Data successfully written to " + bin_file << std::endl;
|
|
||||||
} else {
|
|
||||||
std::cerr << "Error opening file!" << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
free(result_data);
|
|
||||||
// 11. Free memory and exit
|
|
||||||
ggml_free(ctx_cgraph);
|
|
||||||
ggml_gallocr_free(allocr);
|
|
||||||
ggml_free(ctx);
|
|
||||||
ggml_backend_buffer_free(buffer);
|
|
||||||
ggml_backend_free(backend);
|
|
||||||
}
|
|
||||||
|
|
||||||
enum model_output_type {
|
|
||||||
conv3d,
|
|
||||||
patch_embed,
|
|
||||||
patch_win_attn_scatter,
|
|
||||||
first_attn_layer,
|
|
||||||
last_attn_layer,
|
|
||||||
attn_softmax,
|
|
||||||
final_layer,
|
|
||||||
};
|
|
||||||
|
|
||||||
static void debug_dump_img_embed(struct llava_context * ctx_llava, model_output_type output_type) {
|
|
||||||
constexpr int ih = 140;
|
|
||||||
constexpr int iw = 196;
|
|
||||||
// constexpr int ih = 56;
|
|
||||||
// constexpr int iw = 56;
|
|
||||||
// int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
|
|
||||||
int n_embd = 1280;
|
|
||||||
int merge = 1;
|
|
||||||
if (output_type == model_output_type::final_layer) {
|
|
||||||
n_embd = 2048;
|
|
||||||
merge = 2;
|
|
||||||
}
|
|
||||||
else if (output_type == model_output_type::attn_softmax) {
|
|
||||||
merge = 1;
|
|
||||||
n_embd = (ih/14/merge) * (iw/14/merge) * 16;
|
|
||||||
}
|
|
||||||
|
|
||||||
int ne = (ih/14/merge) * (iw/14/merge) * n_embd;
|
|
||||||
float vals[iw * ih * 3];
|
|
||||||
// float embd[ne];
|
|
||||||
std::vector<float> embd;
|
|
||||||
embd.resize(ne);
|
|
||||||
|
|
||||||
for (int i = 0; i < iw*ih; i++)
|
|
||||||
{
|
|
||||||
for (int c = 0; c < 3; c++)
|
|
||||||
vals[i * 3 + c] = (float)i / (iw*ih);
|
|
||||||
}
|
|
||||||
|
|
||||||
clip_encode_float_image(ctx_llava->ctx_clip, 8, vals, ih, iw, embd.data());
|
|
||||||
|
|
||||||
std::string file_postfix = "";
|
|
||||||
switch (output_type)
|
|
||||||
{
|
|
||||||
case model_output_type::conv3d:
|
|
||||||
file_postfix = "conv3d";
|
|
||||||
break;
|
|
||||||
case model_output_type::patch_embed:
|
|
||||||
file_postfix = "patch_embed";
|
|
||||||
break;
|
|
||||||
case model_output_type::patch_win_attn_scatter:
|
|
||||||
file_postfix = "scatter";
|
|
||||||
break;
|
|
||||||
case model_output_type::first_attn_layer:
|
|
||||||
file_postfix = "first_attn";
|
|
||||||
break;
|
|
||||||
case model_output_type::last_attn_layer:
|
|
||||||
file_postfix = "last_attn";
|
|
||||||
break;
|
|
||||||
case model_output_type::attn_softmax:
|
|
||||||
file_postfix = "attn_softmax";
|
|
||||||
break;
|
|
||||||
case model_output_type::final_layer:
|
|
||||||
file_postfix = "final";
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
auto output_path = "img_embed_" + file_postfix + ".bin";
|
|
||||||
|
|
||||||
std::ofstream outFile(output_path, std::ios::binary);
|
|
||||||
if (outFile.is_open()) {
|
|
||||||
outFile.write(reinterpret_cast<const char*>(embd.data()), ne * sizeof(float));
|
|
||||||
|
|
||||||
outFile.close();
|
|
||||||
std::cout << "Data successfully written to ::[ " << output_path << std::endl;
|
|
||||||
} else {
|
|
||||||
std::cerr << "Error opening file!" << std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
|
||||||
ggml_time_init();
|
|
||||||
|
|
||||||
common_params params;
|
|
||||||
|
|
||||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
common_init();
|
|
||||||
|
|
||||||
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
|
||||||
print_usage(argc, argv);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto * model = llava_init(¶ms);
|
|
||||||
if (model == NULL) {
|
|
||||||
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (prompt_contains_image(params.prompt)) {
|
|
||||||
auto * ctx_llava = llava_init_context(¶ms, model);
|
|
||||||
|
|
||||||
auto * image_embed = load_image(ctx_llava, ¶ms, "");
|
|
||||||
|
|
||||||
// process the prompt
|
|
||||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
|
||||||
|
|
||||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
|
||||||
llava_image_embed_free(image_embed);
|
|
||||||
ctx_llava->model = NULL;
|
|
||||||
llava_free(ctx_llava);
|
|
||||||
#ifndef NDEBUG
|
|
||||||
} else if (params.image[0].empty()) {
|
|
||||||
auto ctx_llava = llava_init_context(¶ms, model);
|
|
||||||
|
|
||||||
// debug_test_mrope_2d();
|
|
||||||
debug_dump_img_embed(ctx_llava, model_output_type::final_layer);
|
|
||||||
// debug_dump_img_embed(ctx_llava, model_output_type::last_attn_layer);
|
|
||||||
|
|
||||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
|
||||||
ctx_llava->model = NULL;
|
|
||||||
llava_free(ctx_llava);
|
|
||||||
#endif
|
|
||||||
} else {
|
|
||||||
for (auto & image : params.image) {
|
|
||||||
auto * ctx_llava = llava_init_context(¶ms, model);
|
|
||||||
|
|
||||||
auto * image_embed = load_image(ctx_llava, ¶ms, image);
|
|
||||||
if (!image_embed) {
|
|
||||||
LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// process the prompt
|
|
||||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
|
||||||
|
|
||||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
|
||||||
llava_image_embed_free(image_embed);
|
|
||||||
ctx_llava->model = NULL;
|
|
||||||
llava_free(ctx_llava);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_model_free(model);
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
@ -1,121 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# make sure we are in the right directory
|
|
||||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
|
||||||
cd $SCRIPT_DIR
|
|
||||||
|
|
||||||
#export LLAMA_CACHE="$SCRIPT_DIR/tmp"
|
|
||||||
|
|
||||||
set -eux
|
|
||||||
|
|
||||||
mkdir -p $SCRIPT_DIR/output
|
|
||||||
|
|
||||||
PROJ_ROOT="$SCRIPT_DIR/../.."
|
|
||||||
cd $PROJ_ROOT
|
|
||||||
|
|
||||||
# Check if the first argument is "big", then run test with big models
|
|
||||||
# This is useful if we're running the script on a larger machine, so we can test the big models
|
|
||||||
RUN_BIG_TESTS=false
|
|
||||||
if [ "${1:-}" = "big" ]; then
|
|
||||||
RUN_BIG_TESTS=true
|
|
||||||
echo "Include BIG models..."
|
|
||||||
fi
|
|
||||||
|
|
||||||
###############
|
|
||||||
|
|
||||||
arr_bin=()
|
|
||||||
arr_hf=()
|
|
||||||
arr_tmpl=() # chat template
|
|
||||||
|
|
||||||
add_test() {
|
|
||||||
local bin=$1
|
|
||||||
local hf=$2
|
|
||||||
local tmpl=${3:-""} # default to empty string if not provided
|
|
||||||
arr_bin+=("$bin")
|
|
||||||
arr_hf+=("$hf")
|
|
||||||
arr_tmpl+=("$tmpl")
|
|
||||||
}
|
|
||||||
|
|
||||||
add_test_big() {
|
|
||||||
if [ "$RUN_BIG_TESTS" = true ]; then
|
|
||||||
add_test "$@"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
add_test "llama-mtmd-cli" "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
|
|
||||||
add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
|
|
||||||
add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
|
|
||||||
add_test "llama-mtmd-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
|
|
||||||
add_test "llama-mtmd-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"
|
|
||||||
add_test "llama-mtmd-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
|
|
||||||
add_test "llama-mtmd-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
|
|
||||||
add_test "llama-mtmd-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K" "vicuna"
|
|
||||||
add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
|
|
||||||
add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
|
|
||||||
add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
|
|
||||||
add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
|
|
||||||
add_test "llama-mtmd-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
|
||||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
|
|
||||||
|
|
||||||
# to test the big models, run: ./tests.sh big
|
|
||||||
add_test_big "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M"
|
|
||||||
add_test_big "llama-mtmd-cli" "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
|
|
||||||
|
|
||||||
# these models always give the wrong answer, not sure why
|
|
||||||
# add_test "llama-mtmd-cli" "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M"
|
|
||||||
# add_test "llama-mtmd-cli" "ggml-org/SmolVLM-256M-Instruct-GGUF:Q8_0"
|
|
||||||
# add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0"
|
|
||||||
|
|
||||||
# this model has a broken chat template and is not usable
|
|
||||||
# add_test "llama-mtmd-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
|
|
||||||
|
|
||||||
###############
|
|
||||||
|
|
||||||
cmake --build build -j --target "${arr_bin[@]}"
|
|
||||||
|
|
||||||
arr_res=()
|
|
||||||
|
|
||||||
for i in "${!arr_bin[@]}"; do
|
|
||||||
bin="${arr_bin[$i]}"
|
|
||||||
hf="${arr_hf[$i]}"
|
|
||||||
tmpl="${arr_tmpl[$i]}"
|
|
||||||
|
|
||||||
echo "Running test with binary: $bin and HF model: $hf"
|
|
||||||
echo ""
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
output=$(\
|
|
||||||
"$PROJ_ROOT/build/bin/$bin" \
|
|
||||||
-hf "$hf" \
|
|
||||||
--image $SCRIPT_DIR/test-1.jpeg \
|
|
||||||
-p "what is the publisher name of the newspaper?" \
|
|
||||||
--temp 0 -n 128 \
|
|
||||||
${tmpl:+--chat-template "$tmpl"} \
|
|
||||||
2>&1 | tee /dev/tty)
|
|
||||||
|
|
||||||
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
|
|
||||||
|
|
||||||
if echo "$output" | grep -iq "new york"; then
|
|
||||||
result="\033[32mOK\033[0m: $bin $hf"
|
|
||||||
else
|
|
||||||
result="\033[31mFAIL\033[0m: $bin $hf"
|
|
||||||
fi
|
|
||||||
echo -e "$result"
|
|
||||||
arr_res+=("$result")
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo ""
|
|
||||||
echo ""
|
|
||||||
echo "#################################################"
|
|
||||||
echo "#################################################"
|
|
||||||
echo ""
|
|
||||||
echo ""
|
|
||||||
done
|
|
||||||
|
|
||||||
set +x
|
|
||||||
|
|
||||||
for i in "${!arr_res[@]}"; do
|
|
||||||
echo -e "${arr_res[$i]}"
|
|
||||||
done
|
|
||||||
echo ""
|
|
||||||
echo "Output logs are saved in $SCRIPT_DIR/output"
|
|
@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
|
|||||||
const int N = 5; // n-gram size
|
const int N = 5; // n-gram size
|
||||||
const int G = 15; // max verification n-grams
|
const int G = 15; // max verification n-grams
|
||||||
|
|
||||||
const bool dump_kv_cache = params.dump_kv_cache;
|
|
||||||
|
|
||||||
// init llama.cpp
|
// init llama.cpp
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
@ -62,6 +60,8 @@ int main(int argc, char ** argv) {
|
|||||||
llama_model * model = llama_init.model.get();
|
llama_model * model = llama_init.model.get();
|
||||||
llama_context * ctx = llama_init.context.get();
|
llama_context * ctx = llama_init.context.get();
|
||||||
|
|
||||||
|
auto * mem = llama_get_memory(ctx);
|
||||||
|
|
||||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||||
|
|
||||||
// Tokenize the prompt
|
// Tokenize the prompt
|
||||||
@ -96,7 +96,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
|
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
|
||||||
|
|
||||||
for (int s = 1; s < W + G + 1; ++s) {
|
for (int s = 1; s < W + G + 1; ++s) {
|
||||||
llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
|
llama_memory_seq_cp(mem, 0, s, -1, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto t_enc_end = ggml_time_us();
|
const auto t_enc_end = ggml_time_us();
|
||||||
@ -152,9 +152,6 @@ int main(int argc, char ** argv) {
|
|||||||
// here we keep adding new n-grams as we go
|
// here we keep adding new n-grams as we go
|
||||||
ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);
|
ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);
|
||||||
|
|
||||||
// debug
|
|
||||||
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
|
|
||||||
|
|
||||||
const auto t_dec_start = ggml_time_us();
|
const auto t_dec_start = ggml_time_us();
|
||||||
|
|
||||||
// sample first token
|
// sample first token
|
||||||
@ -172,12 +169,6 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
// debug
|
|
||||||
if (dump_kv_cache) {
|
|
||||||
llama_kv_cache_view_update(ctx, &kvc_view);
|
|
||||||
common_kv_cache_dump_view_seqs(kvc_view, 40);
|
|
||||||
}
|
|
||||||
|
|
||||||
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
||||||
//
|
//
|
||||||
// Example for W = 5, N = 4, G = 2:
|
// Example for W = 5, N = 4, G = 2:
|
||||||
@ -438,17 +429,17 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// KV cache management
|
// KV cache management
|
||||||
// if no verification token matched, we simply remove all cells from this batch -> no fragmentation
|
// if no verification token matched, we simply remove all cells from this batch -> no fragmentation
|
||||||
llama_kv_self_seq_rm(ctx, -1, n_past, -1);
|
llama_memory_seq_rm(mem, -1, n_past, -1);
|
||||||
|
|
||||||
if (seq_id_best != 0) {
|
if (seq_id_best != 0) {
|
||||||
// if a verification token matched, we keep the best sequence and remove the rest
|
// if a verification token matched, we keep the best sequence and remove the rest
|
||||||
// this leads to some KV cache fragmentation
|
// this leads to some KV cache fragmentation
|
||||||
llama_kv_self_seq_keep(ctx, seq_id_best);
|
llama_memory_seq_keep(mem, seq_id_best);
|
||||||
llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1);
|
llama_memory_seq_cp (mem, seq_id_best, 0, -1, -1);
|
||||||
llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1);
|
llama_memory_seq_rm (mem, seq_id_best, -1, -1);
|
||||||
|
|
||||||
for (int s = 1; s < W + G + 1; ++s) {
|
for (int s = 1; s < W + G + 1; ++s) {
|
||||||
llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
|
llama_memory_seq_cp(mem, 0, s, -1, -1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
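The hunks above follow one mechanical pattern; here is a sketch of the substitution, assuming the new `llama_memory_*` declarations in `llama.h` match the calls shown in this diff:

```cpp
// sketch of the API migration applied in this file
auto * mem = llama_get_memory(ctx);                   // fetch the memory handle once

llama_memory_seq_rm  (mem, -1, n_past, -1);           // was: llama_kv_self_seq_rm  (ctx, -1, n_past, -1);
llama_memory_seq_keep(mem, seq_id_best);              // was: llama_kv_self_seq_keep(ctx, seq_id_best);
llama_memory_seq_cp  (mem, seq_id_best, 0, -1, -1);   // was: llama_kv_self_seq_cp  (ctx, seq_id_best, 0, -1, -1);
```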
@ -473,8 +464,6 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
common_sampler_free(smpl);
|
common_sampler_free(smpl);
|
||||||
|
|
||||||
llama_kv_cache_view_free(&kvc_view);
|
|
||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
@ -24,8 +24,6 @@ int main(int argc, char ** argv){
|
|||||||
// max. number of additional tokens to draft if match is found
|
// max. number of additional tokens to draft if match is found
|
||||||
const int n_draft = params.speculative.n_max;
|
const int n_draft = params.speculative.n_max;
|
||||||
|
|
||||||
const bool dump_kv_cache = params.dump_kv_cache;
|
|
||||||
|
|
||||||
// init llama.cpp
|
// init llama.cpp
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
@ -110,18 +108,9 @@ int main(int argc, char ** argv){
|
|||||||
|
|
||||||
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
|
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
|
||||||
|
|
||||||
// debug
|
|
||||||
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
|
|
||||||
|
|
||||||
const auto t_dec_start = ggml_time_us();
|
const auto t_dec_start = ggml_time_us();
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
// debug
|
|
||||||
if (dump_kv_cache) {
|
|
||||||
llama_kv_cache_view_update(ctx, &kvc_view);
|
|
||||||
common_kv_cache_dump_view_seqs(kvc_view, 40);
|
|
||||||
}
|
|
||||||
|
|
||||||
// print current draft sequence
|
// print current draft sequence
|
||||||
LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
|
LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
|
||||||
|
|
||||||
@ -192,7 +181,7 @@ int main(int argc, char ** argv){
|
|||||||
|
|
||||||
// KV cache management
|
// KV cache management
|
||||||
// clean the cache of draft tokens that weren't accepted
|
// clean the cache of draft tokens that weren't accepted
|
||||||
llama_kv_self_seq_rm(ctx, 0, n_past, -1);
|
llama_memory_seq_rm(llama_get_memory(ctx), 0, n_past, -1);
|
||||||
|
|
||||||
common_batch_clear(batch_tgt);
|
common_batch_clear(batch_tgt);
|
||||||
common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
|
common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
|
||||||
|
@ -1,3 +1,14 @@
|
|||||||
# llama.cpp/example/parallel
|
# llama.cpp/example/parallel
|
||||||
|
|
||||||
Simplified simulation of serving incoming requests in parallel
|
Simplified simulation of serving incoming requests in parallel
|
||||||
|
|
||||||
|
## Example
|
||||||
|
|
||||||
|
Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of up to 10 junk questions (`--junk 10`) followed by the actual question.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
llama-parallel -m model.gguf -np 8 -ns 128 --top-k 1 -pps --junk 10 -c 16384
|
||||||
|
```
|
||||||
|
|
||||||
|
> [!NOTE]
|
||||||
|
> It's recommended to use base models with this example. Instruction tuned models might not be able to properly follow the custom chat template specified here, so the results might not be as expected.
|
||||||
|
@@ -34,11 +34,61 @@ static std::string k_system =
 R"(Transcript of a never ending dialog, where the User interacts with an Assistant.
 The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
 
-User: Recommend a nice restaurant in the area.
-Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
-User: Who is Richard Feynman?
-Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
-User:)";
+User:
+Recommend a nice restaurant in the area.
+Assistant:
+I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
+User:
+Who is Richard Feynman?
+Assistant:
+Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
+)";
 
+static std::vector<std::string> k_questions = {
+    "What is the tallest mountain in the world?",
+    "Who was the first person to win two Nobel Prizes?",
+    "Which country invented paper?",
+    "What organ is primarily responsible for pumping blood throughout the body?",
+    "Which planet is known for its prominent ring system?",
+    "Who directed the movie 'Inception'?",
+    "What is the freezing point of water in Fahrenheit?",
+    "Which animal is known to have the longest lifespan?",
+    "What language has the most native speakers worldwide?",
+    "What is the capital city of Canada?",
+    "Who is credited with inventing the World Wide Web?",
+    "Which metal is liquid at room temperature?",
+    "What is the term for an animal that eats both plants and meat?",
+    "Who painted 'The Starry Night'?",
+    "What gas do humans exhale that plants use for photosynthesis?",
+    "What year did World War II end?",
+    "Which continent has the most countries?",
+    "Who wrote the novel 'Frankenstein'?",
+    "What does DNA stand for?",
+    "What is the main ingredient in traditional Japanese miso soup?"
+};
+
+static std::vector<std::string> k_answers = {
+    "The tallest mountain in the world is Mount Everest.",
+    "Marie Curie was the first person to win two Nobel Prizes.",
+    "Paper was invented in China.",
+    "The heart is the organ responsible for pumping blood.",
+    "Saturn is known for its prominent ring system.",
+    "Christopher Nolan directed the movie 'Inception'.",
+    "The freezing point of water in Fahrenheit is 32°F.",
+    "The bowhead whale is known to have the longest lifespan among mammals.",
+    "Mandarin Chinese has the most native speakers in the world.",
+    "The capital city of Canada is Ottawa.",
+    "Tim Berners-Lee is credited with inventing the World Wide Web.",
+    "Mercury is the metal that is liquid at room temperature.",
+    "An animal that eats both plants and meat is called an omnivore.",
+    "'The Starry Night' was painted by Vincent van Gogh.",
+    "Humans exhale carbon dioxide, which plants use in photosynthesis.",
+    "World War II ended in 1945.",
+    "Africa is the continent with the most countries.",
+    "The novel 'Frankenstein' was written by Mary Shelley.",
+    "DNA stands for Deoxyribonucleic Acid.",
+    "The main ingredient in traditional Japanese miso soup is fermented soybean paste."
+};
+
 static std::vector<std::string> k_prompts = {
     "What is the meaning of life?",
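The `k_questions`/`k_answers` pairs added above are filler material: a later hunk in this diff pads each client's prompt with a random number of these question/answer turns before the real request. A minimal sketch of that assembly, with `make_junk` as a hypothetical helper name that is not part of the example itself:

```cpp
#include <cstdlib>
#include <string>
#include <vector>

// Hypothetical helper mirroring the prompt padding done later in this diff:
// append n_turns random question/answer exchanges in the same
// "User:\n...\nAssistant:\n ...\n" layout used by the example.
static std::string make_junk(const std::vector<std::string> & questions,
                             const std::vector<std::string> & answers,
                             int n_turns) {
    std::string junk;
    for (int i = 0; i < n_turns; ++i) {
        const int r = rand() % (int) questions.size();
        junk += "User:\n" + questions[r] + "\nAssistant:\n " + answers[r] + "\n";
    }
    return junk;
}
```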
@@ -49,7 +99,7 @@ static std::vector<std::string> k_prompts = {
     "What is the best way to learn a new language?",
     "How to get a job at Google?",
     "If you could have any superpower, what would it be?",
-    "I want to learn how to play the piano.",
+    "I want to learn how to play the piano. What would be the best way to do it?",
 };
 
 struct client {
@@ -68,6 +118,7 @@ struct client {
     int64_t t_start_prompt;
     int64_t t_start_gen;
 
+    int32_t n_past    = 0;
     int32_t n_prompt  = 0;
     int32_t n_decoded = 0;
     int32_t i_batch   = -1;
@@ -107,6 +158,7 @@ int main(int argc, char ** argv) {
     common_params params;
 
     params.n_predict = 128;
+    params.n_junk    = 1;
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
         return 1;
@@ -126,7 +178,11 @@ int main(int argc, char ** argv) {
     // insert new requests as soon as the previous one is done
     const bool cont_batching = params.cont_batching;
 
-    const bool dump_kv_cache = params.dump_kv_cache;
+    // is the system prompt shared in the cache
+    const bool is_sp_shared = params.is_pp_shared;
+
+    // extra text to insert in each client's prompt in order to make it larger
+    const int32_t n_junk = std::max(1, params.n_junk);
 
     // init llama.cpp
     llama_backend_init();
@@ -138,6 +194,8 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_init.model.get();
     llama_context * ctx = llama_init.context.get();
 
+    auto * mem = llama_get_memory(ctx);
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     // load the prompts from an external file if there are any
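The `llama_get_memory(ctx)` handle fetched here is what the rest of this diff routes all per-sequence KV-cache operations through, replacing the old `llama_kv_self_*` calls that took the context directly. A condensed sketch of the pattern, using only the function names that appear in the hunks below:

```cpp
#include "llama.h"

// Condensed form of the reset logic used further down in this diff: drop each
// client's sequence and re-share the system prompt held in sequence 0, going
// through the memory handle instead of the removed llama_kv_self_* wrappers.
static void reset_client_sequences(llama_context * ctx, int n_clients) {
    auto * mem = llama_get_memory(ctx);

    for (int i = 1; i <= n_clients; ++i) {
        llama_memory_seq_rm(mem, i, -1, -1);     // remove everything in sequence i
        llama_memory_seq_cp(mem, 0, i, -1, -1);  // copy the system prompt back in
    }
}
```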
@@ -169,6 +227,7 @@ int main(int argc, char ** argv) {
     }
 
     std::vector<llama_token> tokens_system;
+
     tokens_system = common_tokenize(ctx, k_system, true);
     const int32_t n_tokens_system = tokens_system.size();
 
@@ -182,15 +241,13 @@ int main(int argc, char ** argv) {
     int32_t n_total_gen  = 0;
     int32_t n_cache_miss = 0;
 
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
-
     const auto t_main_start = ggml_time_us();
 
     LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
     LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
     LOG_INF("\n");
 
-    {
+    if (is_sp_shared) {
         LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
 
         for (int32_t i = 0; i < n_tokens_system; ++i) {
@@ -204,7 +261,7 @@ int main(int argc, char ** argv) {
 
         // assign the system KV cache to all parallel sequences
         for (int32_t i = 1; i <= n_clients; ++i) {
-            llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
+            llama_memory_seq_cp(mem, 0, i, -1, -1);
         }
 
         LOG_INF("\n");
@@ -213,11 +270,6 @@ int main(int argc, char ** argv) {
     LOG_INF("Processing requests ...\n\n");
 
     while (true) {
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
         common_batch_clear(batch);
 
         // decode any currently ongoing sequences
@@ -228,7 +280,7 @@ int main(int argc, char ** argv) {
 
             client.i_batch = batch.n_tokens;
 
-            common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
+            common_batch_add(batch, client.sampled, client.n_past++, { client.id + 1 }, true);
 
             client.n_decoded += 1;
         }
@@ -236,9 +288,9 @@ int main(int argc, char ** argv) {
         if (batch.n_tokens == 0) {
             // all sequences have ended - clear the entire KV cache
             for (int i = 1; i <= n_clients; ++i) {
-                llama_kv_self_seq_rm(ctx, i, -1, -1);
+                llama_memory_seq_rm(mem, i, -1, -1);
                 // but keep the system prompt
-                llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
+                llama_memory_seq_cp(mem, 0, i, -1, -1);
             }
 
             LOG_INF("%s: clearing the KV cache\n", __func__);
@@ -254,9 +306,26 @@ int main(int argc, char ** argv) {
                 client.t_start_gen = 0;
 
                 client.input    = k_prompts[rand() % k_prompts.size()];
-                client.prompt   = client.input + "\nAssistant:";
                 client.response = "";
 
+                // construct the prompt:
+                // [system prompt] + [junk] + [user prompt]
+                client.n_past = 0;
+                client.prompt = "";
+                if (is_sp_shared) {
+                    client.n_past = n_tokens_system;
+                } else {
+                    client.prompt += k_system;
+                }
+
+                const int n_junk_cur = rand() % n_junk;
+
+                for (int i = 0; i < n_junk_cur; ++i) {
+                    const int r = rand() % k_questions.size();
+                    client.prompt += "User:\n" + k_questions[r] + "\nAssistant:\n " + k_answers[r] + "\n";
+                }
+                client.prompt += "User:\n" + client.input + "\nAssistant:\n";
+
                 common_sampler_reset(client.smpl);
 
                 // do not prepend BOS because we have a system prompt!
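With the per-client `n_past` counter introduced here, KV positions are no longer derived from `n_tokens_system` plus an offset; each client advances its own counter, which starts at `n_tokens_system` only when the system prompt is shared. A small illustrative sketch of feeding a prompt at explicit positions with the helpers the example already uses (`common_tokenize`, `common_batch_add`); the function name is hypothetical:

```cpp
#include <string>
#include <vector>
#include "common.h"  // common_tokenize, common_batch_add

// Illustrative sketch: tokenize one client's prompt and queue it at explicit
// positions, advancing the per-client n_past counter as in this hunk.
static void queue_prompt(llama_context * ctx, llama_batch & batch,
                         const std::string & prompt, llama_seq_id seq_id,
                         int32_t & n_past) {
    const std::vector<llama_token> tokens = common_tokenize(ctx, prompt, /*add_special=*/false);

    for (size_t i = 0; i < tokens.size(); ++i) {
        // only the last prompt token needs logits, as in the example
        common_batch_add(batch, tokens[i], n_past++, { seq_id }, i == tokens.size() - 1);
    }
}
```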
@@ -264,7 +333,7 @@ int main(int argc, char ** argv) {
                 tokens_prompt = common_tokenize(ctx, client.prompt, false);
 
                 for (size_t i = 0; i < tokens_prompt.size(); ++i) {
-                    common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
+                    common_batch_add(batch, tokens_prompt[i], client.n_past++, { client.id + 1 }, false);
                 }
 
                 // extract the logits only for the last token
@@ -276,7 +345,7 @@ int main(int argc, char ** argv) {
                 client.n_decoded = 0;
                 client.i_batch   = batch.n_tokens - 1;
 
-                LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
+                LOG_INF("\033[31mClient %3d, seq %4d, junk = %4d, started decoding ...\033[0m\n", client.id, client.seq_id, n_junk_cur);
 
                 g_seq_id += 1;
 
@@ -295,7 +364,9 @@ int main(int argc, char ** argv) {
         // process in chunks of params.n_batch
         int32_t n_batch = params.n_batch;
 
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+        int32_t i_next = 0;
+
+        for (int32_t i = 0; i < batch.n_tokens; i = i_next) {
             // experiment: process in powers of 2
             //if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) {
             //    n_batch /= 2;
@@ -303,7 +374,7 @@ int main(int argc, char ** argv) {
             //    continue;
             //}
 
-            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
 
             llama_batch batch_view = {
                 n_tokens,
@@ -323,19 +394,24 @@ int main(int argc, char ** argv) {
                     return 1;
                 }
 
-                LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+                LOG_WRN("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
 
                 n_cache_miss += 1;
 
                 // retry with half the batch size to try to find a free slot in the KV cache
                 n_batch /= 2;
-                i -= n_batch;
 
                 continue;
             }
 
             LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
 
+            // move the head of the batch forward with the number of tokens we just processed
+            i_next = i + n_tokens;
+
+            // on successful decode, restore the original batch size
+            n_batch = params.n_batch;
+
             for (auto & client : clients) {
                 if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
                     continue;
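The loop now advances with `i_next`, so a failed `llama_decode` can retry the same token window with a halved batch instead of the old `i -= n_batch` bookkeeping, and the batch size snaps back to `params.n_batch` after a success. A condensed sketch of that control flow; the `batch_view` slice follows the `llama_batch` fields as used by the surrounding example, and the error handling is simplified:

```cpp
#include <algorithm>
#include "llama.h"

// Condensed sketch of the retry logic above: halve the batch on a recoverable
// decode failure and retry the same window; advance and restore the full
// batch size only after a successful decode.
static bool decode_in_chunks(llama_context * ctx, llama_batch & batch, int32_t n_batch_max) {
    int32_t n_batch = n_batch_max;
    int32_t i_next  = 0;

    for (int32_t i = 0; i < batch.n_tokens; i = i_next) {
        const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);

        llama_batch batch_view = {
            n_tokens,
            batch.token    + i,
            nullptr,
            batch.pos      + i,
            batch.n_seq_id + i,
            batch.seq_id   + i,
            batch.logits   + i,
        };

        const int ret = llama_decode(ctx, batch_view);
        if (ret != 0) {
            if (n_batch == 1 || ret < 0) {
                return false;  // fatal error, or nothing left to shrink
            }
            n_batch /= 2;      // retry the same window with a smaller batch
            continue;          // i is unchanged
        }

        i_next  = i + n_tokens;  // move the head forward by what was processed
        n_batch = n_batch_max;   // restore the original batch size
    }

    return true;
}
```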
@@ -363,10 +439,9 @@ int main(int argc, char ** argv) {
                 //       client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
 
                 if (client.n_decoded > 2 &&
                         (llama_vocab_is_eog(vocab, id) ||
-                         (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
-                         client.response.find("User:") != std::string::npos ||
-                         client.response.find('\n') != std::string::npos)) {
+                         (params.n_predict > 0 && client.n_decoded >= params.n_predict) ||
+                         client.response.find("User:") != std::string::npos)) {
                     // basic reverse prompt
                     const size_t pos = client.response.find("User:");
                     if (pos != std::string::npos) {
@@ -374,8 +449,8 @@ int main(int argc, char ** argv) {
                     }
 
                     // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
-                    llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1);
-                    llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1);
+                    llama_memory_seq_rm(mem, client.id + 1, -1, -1);
+                    llama_memory_seq_cp(mem, 0, client.id + 1, -1, -1);
 
                     const auto t_main_end = ggml_time_us();
 
@@ -126,6 +126,8 @@ int main(int argc, char ** argv) {
 
     int n_past = 0;
 
+    auto * mem = llama_get_memory(ctx);
+
     // fill the KV cache
     for (int i = 0; i < n_ctx; i += n_batch) {
         if (i > 0 && n_grp > 1) {
@@ -133,11 +135,10 @@ int main(int argc, char ** argv) {
             const int ib = i/n_batch - 1;
             const int bd = n_batch_grp*(n_grp - 1);
 
-            llama_kv_self_seq_add (ctx, 0, n_past - n_batch,         n_past,         ib*bd);
-            llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
-            llama_kv_self_update  (ctx);
+            llama_memory_seq_add(mem, 0, n_past - n_batch,         n_past,         ib*bd);
+            llama_memory_seq_div(mem, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
 
-            n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
+            n_past = llama_memory_seq_pos_max(mem, 0) + 1;
         }
 
         common_batch_clear(batch);
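The grouped self-extend trick in the passkey example is now expressed through the memory API: `llama_memory_seq_add` shifts the positions of a token range by a delta, and `llama_memory_seq_div` divides them by a factor (the ranges are assumed to be half-open, matching the call sites above). A sketch of the step in isolation:

```cpp
#include "llama.h"

// Sketch of one self-extend step as used above: shift the freshly decoded
// block so it lines up with its group, then divide the shifted range by n_grp
// to compress the positions.
static void self_extend_step(llama_context * ctx, int n_past, int n_batch, int ib, int bd, int n_grp) {
    auto * mem = llama_get_memory(ctx);

    llama_memory_seq_add(mem, 0, n_past - n_batch,         n_past,         ib*bd);
    llama_memory_seq_div(mem, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
}
```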
@@ -167,12 +168,10 @@ int main(int argc, char ** argv) {
 
             LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
 
-            llama_kv_self_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
-            llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
-            //llama_kv_self_defrag (ctx);
-            llama_kv_self_update (ctx);
+            llama_memory_seq_rm (mem, 0, n_keep            , n_keep + n_discard);
+            llama_memory_seq_add(mem, 0, n_keep + n_discard, n_ctx,  -n_discard);
 
-            n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
+            n_past = llama_memory_seq_pos_max(mem, 0) + 1;
 
             common_batch_clear(batch);
 
@@ -198,12 +197,10 @@ int main(int argc, char ** argv) {
         if (n_discard > 0) {
             LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
 
-            llama_kv_self_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
-            llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
-            //llama_kv_self_defrag (ctx);
-            llama_kv_self_update (ctx);
+            llama_memory_seq_rm (mem, 0, n_keep            , n_keep + n_discard);
+            llama_memory_seq_add(mem, 0, n_keep + n_discard, n_ctx,  -n_discard);
 
-            n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
+            n_past = llama_memory_seq_pos_max(mem, 0) + 1;
         }
     }
 
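Both shifting hunks above implement the same sliding-window move: delete `n_discard` positions after the first `n_keep` tokens, then slide the remaining tail back so sequence 0 stays contiguous, with the explicit `llama_kv_self_update` call simply dropped under the new API. The pattern in isolation:

```cpp
#include "llama.h"

// Sketch of the context-shift pattern above: drop n_discard tokens after the
// first n_keep, then shift the remaining tail back by n_discard so the
// positions of sequence 0 stay contiguous.
static void shift_context(llama_context * ctx, int n_keep, int n_discard, int n_ctx) {
    auto * mem = llama_get_memory(ctx);

    llama_memory_seq_rm (mem, 0, n_keep,             n_keep + n_discard);
    llama_memory_seq_add(mem, 0, n_keep + n_discard, n_ctx,  -n_discard);
}
```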
@@ -23,7 +23,7 @@ def create_completion(host, prompt, gbnf_grammar):
     """Calls the /completion API on llama-server.
 
     See
-    https://github.com/ggml-org/llama.cpp/tree/HEAD/examples/server#api-endpoints
+    https://github.com/ggml-org/llama.cpp/tree/HEAD/tools/server#api-endpoints
     """
     print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}")
     headers = {"Content-Type": "application/json"}
@@ -81,14 +81,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
     }
 }
 
-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+static void batch_process(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_self_clear(ctx);
+    llama_memory_clear(llama_get_memory(ctx), false);
 
     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
     if (llama_decode(ctx, batch) < 0) {
-        LOG_ERR("%s : failed to decode\n", __func__);
+        LOG_ERR("%s : failed to process\n", __func__);
     }
 
     for (int i = 0; i < batch.n_tokens; i++) {
@@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch) {
            float * out = emb + p * n_embd;
-            batch_decode(ctx, batch, out, s, n_embd);
+            batch_process(ctx, batch, out, s, n_embd);
            common_batch_clear(batch);
            p += s;
            s = 0;
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
 
     // final batch
     float * out = emb + p * n_embd;
-    batch_decode(ctx, batch, out, s, n_embd);
+    batch_process(ctx, batch, out, s, n_embd);
 
     // save embeddings to chunks
     for (int i = 0; i < n_chunks; i++) {
@@ -267,7 +267,7 @@ int main(int argc, char ** argv) {
         batch_add_seq(query_batch, query_tokens, 0);
 
         std::vector<float> query_emb(n_embd, 0);
-        batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
+        batch_process(ctx, query_batch, query_emb.data(), 1, n_embd);
 
         common_batch_clear(query_batch);
 
@@ -196,7 +196,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
 
     // erase whole kv
-    llama_kv_self_clear(ctx3);
+    llama_memory_clear(llama_get_memory(ctx3), true);
     fprintf(stderr, "%s : kv cache cleared\n", __func__);
 
     // restore kv into seq 1
Binary file not shown.
@ -1,296 +0,0 @@
|
|||||||
import { useEffect, useMemo, useState } from 'react';
|
|
||||||
import { CallbackGeneratedChunk, useAppContext } from '../utils/app.context';
|
|
||||||
import ChatMessage from './ChatMessage';
|
|
||||||
import { CanvasType, Message, PendingMessage } from '../utils/types';
|
|
||||||
import { classNames, cleanCurrentUrl, throttle } from '../utils/misc';
|
|
||||||
import CanvasPyInterpreter from './CanvasPyInterpreter';
|
|
||||||
import StorageUtils from '../utils/storage';
|
|
||||||
import { useVSCodeContext } from '../utils/llama-vscode';
|
|
||||||
import { useChatTextarea, ChatTextareaApi } from './useChatTextarea.ts';
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A message display is a message node with additional information for rendering.
|
|
||||||
* For example, siblings of the message node are stored as their last node (aka leaf node).
|
|
||||||
*/
|
|
||||||
export interface MessageDisplay {
|
|
||||||
msg: Message | PendingMessage;
|
|
||||||
siblingLeafNodeIds: Message['id'][];
|
|
||||||
siblingCurrIdx: number;
|
|
||||||
isPending?: boolean;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* If the current URL contains "?m=...", prefill the message input with the value.
|
|
||||||
* If the current URL contains "?q=...", prefill and SEND the message.
|
|
||||||
*/
|
|
||||||
const prefilledMsg = {
|
|
||||||
content() {
|
|
||||||
const url = new URL(window.location.href);
|
|
||||||
return url.searchParams.get('m') ?? url.searchParams.get('q') ?? '';
|
|
||||||
},
|
|
||||||
shouldSend() {
|
|
||||||
const url = new URL(window.location.href);
|
|
||||||
return url.searchParams.has('q');
|
|
||||||
},
|
|
||||||
clear() {
|
|
||||||
cleanCurrentUrl(['m', 'q']);
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
function getListMessageDisplay(
|
|
||||||
msgs: Readonly<Message[]>,
|
|
||||||
leafNodeId: Message['id']
|
|
||||||
): MessageDisplay[] {
|
|
||||||
const currNodes = StorageUtils.filterByLeafNodeId(msgs, leafNodeId, true);
|
|
||||||
const res: MessageDisplay[] = [];
|
|
||||||
const nodeMap = new Map<Message['id'], Message>();
|
|
||||||
for (const msg of msgs) {
|
|
||||||
nodeMap.set(msg.id, msg);
|
|
||||||
}
|
|
||||||
// find leaf node from a message node
|
|
||||||
const findLeafNode = (msgId: Message['id']): Message['id'] => {
|
|
||||||
let currNode: Message | undefined = nodeMap.get(msgId);
|
|
||||||
while (currNode) {
|
|
||||||
if (currNode.children.length === 0) break;
|
|
||||||
currNode = nodeMap.get(currNode.children.at(-1) ?? -1);
|
|
||||||
}
|
|
||||||
return currNode?.id ?? -1;
|
|
||||||
};
|
|
||||||
// traverse the current nodes
|
|
||||||
for (const msg of currNodes) {
|
|
||||||
const parentNode = nodeMap.get(msg.parent ?? -1);
|
|
||||||
if (!parentNode) continue;
|
|
||||||
const siblings = parentNode.children;
|
|
||||||
if (msg.type !== 'root') {
|
|
||||||
res.push({
|
|
||||||
msg,
|
|
||||||
siblingLeafNodeIds: siblings.map(findLeafNode),
|
|
||||||
siblingCurrIdx: siblings.indexOf(msg.id),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
const scrollToBottom = throttle(
|
|
||||||
(requiresNearBottom: boolean, delay: number = 80) => {
|
|
||||||
const mainScrollElem = document.getElementById('main-scroll');
|
|
||||||
if (!mainScrollElem) return;
|
|
||||||
const spaceToBottom =
|
|
||||||
mainScrollElem.scrollHeight -
|
|
||||||
mainScrollElem.scrollTop -
|
|
||||||
mainScrollElem.clientHeight;
|
|
||||||
if (!requiresNearBottom || spaceToBottom < 50) {
|
|
||||||
setTimeout(
|
|
||||||
() => mainScrollElem.scrollTo({ top: mainScrollElem.scrollHeight }),
|
|
||||||
delay
|
|
||||||
);
|
|
||||||
}
|
|
||||||
},
|
|
||||||
80
|
|
||||||
);
|
|
||||||
|
|
||||||
export default function ChatScreen() {
|
|
||||||
const {
|
|
||||||
viewingChat,
|
|
||||||
sendMessage,
|
|
||||||
isGenerating,
|
|
||||||
stopGenerating,
|
|
||||||
pendingMessages,
|
|
||||||
canvasData,
|
|
||||||
replaceMessageAndGenerate,
|
|
||||||
} = useAppContext();
|
|
||||||
|
|
||||||
const textarea: ChatTextareaApi = useChatTextarea(prefilledMsg.content());
|
|
||||||
|
|
||||||
const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
|
|
||||||
// TODO: improve this when we have "upload file" feature
|
|
||||||
const currExtra: Message['extra'] = extraContext ? [extraContext] : undefined;
|
|
||||||
|
|
||||||
// keep track of leaf node for rendering
|
|
||||||
const [currNodeId, setCurrNodeId] = useState<number>(-1);
|
|
||||||
const messages: MessageDisplay[] = useMemo(() => {
|
|
||||||
if (!viewingChat) return [];
|
|
||||||
else return getListMessageDisplay(viewingChat.messages, currNodeId);
|
|
||||||
}, [currNodeId, viewingChat]);
|
|
||||||
|
|
||||||
const currConvId = viewingChat?.conv.id ?? null;
|
|
||||||
const pendingMsg: PendingMessage | undefined =
|
|
||||||
pendingMessages[currConvId ?? ''];
|
|
||||||
|
|
||||||
useEffect(() => {
|
|
||||||
// reset to latest node when conversation changes
|
|
||||||
setCurrNodeId(-1);
|
|
||||||
// scroll to bottom when conversation changes
|
|
||||||
scrollToBottom(false, 1);
|
|
||||||
}, [currConvId]);
|
|
||||||
|
|
||||||
const onChunk: CallbackGeneratedChunk = (currLeafNodeId?: Message['id']) => {
|
|
||||||
if (currLeafNodeId) {
|
|
||||||
setCurrNodeId(currLeafNodeId);
|
|
||||||
}
|
|
||||||
scrollToBottom(true);
|
|
||||||
};
|
|
||||||
|
|
||||||
const sendNewMessage = async () => {
|
|
||||||
const lastInpMsg = textarea.value();
|
|
||||||
if (lastInpMsg.trim().length === 0 || isGenerating(currConvId ?? ''))
|
|
||||||
return;
|
|
||||||
textarea.setValue('');
|
|
||||||
scrollToBottom(false);
|
|
||||||
setCurrNodeId(-1);
|
|
||||||
// get the last message node
|
|
||||||
const lastMsgNodeId = messages.at(-1)?.msg.id ?? null;
|
|
||||||
if (
|
|
||||||
!(await sendMessage(
|
|
||||||
currConvId,
|
|
||||||
lastMsgNodeId,
|
|
||||||
lastInpMsg,
|
|
||||||
currExtra,
|
|
||||||
onChunk
|
|
||||||
))
|
|
||||||
) {
|
|
||||||
// restore the input message if failed
|
|
||||||
textarea.setValue(lastInpMsg);
|
|
||||||
}
|
|
||||||
// OK
|
|
||||||
clearExtraContext();
|
|
||||||
};
|
|
||||||
|
|
||||||
const handleEditMessage = async (msg: Message, content: string) => {
|
|
||||||
if (!viewingChat) return;
|
|
||||||
setCurrNodeId(msg.id);
|
|
||||||
scrollToBottom(false);
|
|
||||||
await replaceMessageAndGenerate(
|
|
||||||
viewingChat.conv.id,
|
|
||||||
msg.parent,
|
|
||||||
content,
|
|
||||||
msg.extra,
|
|
||||||
onChunk
|
|
||||||
);
|
|
||||||
setCurrNodeId(-1);
|
|
||||||
scrollToBottom(false);
|
|
||||||
};
|
|
||||||
|
|
||||||
const handleRegenerateMessage = async (msg: Message) => {
|
|
||||||
if (!viewingChat) return;
|
|
||||||
setCurrNodeId(msg.parent);
|
|
||||||
scrollToBottom(false);
|
|
||||||
await replaceMessageAndGenerate(
|
|
||||||
viewingChat.conv.id,
|
|
||||||
msg.parent,
|
|
||||||
null,
|
|
||||||
msg.extra,
|
|
||||||
onChunk
|
|
||||||
);
|
|
||||||
setCurrNodeId(-1);
|
|
||||||
scrollToBottom(false);
|
|
||||||
};
|
|
||||||
|
|
||||||
const hasCanvas = !!canvasData;
|
|
||||||
|
|
||||||
useEffect(() => {
|
|
||||||
if (prefilledMsg.shouldSend()) {
|
|
||||||
// send the prefilled message if needed
|
|
||||||
sendNewMessage();
|
|
||||||
} else {
|
|
||||||
// otherwise, focus on the input
|
|
||||||
textarea.focus();
|
|
||||||
}
|
|
||||||
prefilledMsg.clear();
|
|
||||||
// no need to keep track of sendNewMessage
|
|
||||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
|
||||||
}, [textarea.ref]);
|
|
||||||
|
|
||||||
// due to some timing issues of StorageUtils.appendMsg(), we need to make sure the pendingMsg is not duplicated upon rendering (i.e. appears once in the saved conversation and once in the pendingMsg)
|
|
||||||
const pendingMsgDisplay: MessageDisplay[] =
|
|
||||||
pendingMsg && messages.at(-1)?.msg.id !== pendingMsg.id
|
|
||||||
? [
|
|
||||||
{
|
|
||||||
msg: pendingMsg,
|
|
||||||
siblingLeafNodeIds: [],
|
|
||||||
siblingCurrIdx: 0,
|
|
||||||
isPending: true,
|
|
||||||
},
|
|
||||||
]
|
|
||||||
: [];
|
|
||||||
|
|
||||||
return (
|
|
||||||
<div
|
|
||||||
className={classNames({
|
|
||||||
'grid lg:gap-8 grow transition-[300ms]': true,
|
|
||||||
'grid-cols-[1fr_0fr] lg:grid-cols-[1fr_1fr]': hasCanvas, // adapted for mobile
|
|
||||||
'grid-cols-[1fr_0fr]': !hasCanvas,
|
|
||||||
})}
|
|
||||||
>
|
|
||||||
<div
|
|
||||||
className={classNames({
|
|
||||||
'flex flex-col w-full max-w-[900px] mx-auto': true,
|
|
||||||
'hidden lg:flex': hasCanvas, // adapted for mobile
|
|
||||||
flex: !hasCanvas,
|
|
||||||
})}
|
|
||||||
>
|
|
||||||
{/* chat messages */}
|
|
||||||
<div id="messages-list" className="grow">
|
|
||||||
<div className="mt-auto flex justify-center">
|
|
||||||
{/* placeholder to shift the message to the bottom */}
|
|
||||||
{viewingChat ? '' : 'Send a message to start'}
|
|
||||||
</div>
|
|
||||||
{[...messages, ...pendingMsgDisplay].map((msg) => (
|
|
||||||
<ChatMessage
|
|
||||||
key={msg.msg.id}
|
|
||||||
msg={msg.msg}
|
|
||||||
siblingLeafNodeIds={msg.siblingLeafNodeIds}
|
|
||||||
siblingCurrIdx={msg.siblingCurrIdx}
|
|
||||||
onRegenerateMessage={handleRegenerateMessage}
|
|
||||||
onEditMessage={handleEditMessage}
|
|
||||||
onChangeSibling={setCurrNodeId}
|
|
||||||
/>
|
|
||||||
))}
|
|
||||||
</div>
|
|
||||||
|
|
||||||
{/* chat input */}
|
|
||||||
<div className="flex flex-row items-end pt-8 pb-6 sticky bottom-0 bg-base-100">
|
|
||||||
<textarea
|
|
||||||
// Default (mobile): Enable vertical resize, overflow auto for scrolling if needed
|
|
||||||
// Large screens (lg:): Disable manual resize, apply max-height for autosize limit
|
|
||||||
className="textarea textarea-bordered w-full resize-vertical lg:resize-none lg:max-h-48 lg:overflow-y-auto" // Adjust lg:max-h-48 as needed (e.g., lg:max-h-60)
|
|
||||||
placeholder="Type a message (Shift+Enter to add a new line)"
|
|
||||||
ref={textarea.ref}
|
|
||||||
onInput={textarea.onInput} // Hook's input handler (will only resize height on lg+ screens)
|
|
||||||
onKeyDown={(e) => {
|
|
||||||
if (e.nativeEvent.isComposing || e.keyCode === 229) return;
|
|
||||||
if (e.key === 'Enter' && !e.shiftKey) {
|
|
||||||
e.preventDefault();
|
|
||||||
sendNewMessage();
|
|
||||||
}
|
|
||||||
}}
|
|
||||||
id="msg-input"
|
|
||||||
dir="auto"
|
|
||||||
// Set a base height of 2 rows for mobile views
|
|
||||||
// On lg+ screens, the hook will calculate and set the initial height anyway
|
|
||||||
rows={2}
|
|
||||||
></textarea>
|
|
||||||
|
|
||||||
{isGenerating(currConvId ?? '') ? (
|
|
||||||
<button
|
|
||||||
className="btn btn-neutral ml-2"
|
|
||||||
onClick={() => stopGenerating(currConvId ?? '')}
|
|
||||||
>
|
|
||||||
Stop
|
|
||||||
</button>
|
|
||||||
) : (
|
|
||||||
<button className="btn btn-primary ml-2" onClick={sendNewMessage}>
|
|
||||||
Send
|
|
||||||
</button>
|
|
||||||
)}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
<div className="w-full sticky top-[7em] h-[calc(100vh-9em)]">
|
|
||||||
{canvasData?.type === CanvasType.PY_INTERPRETER && (
|
|
||||||
<CanvasPyInterpreter />
|
|
||||||
)}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
);
|
|
||||||
}
|
|
@ -1,178 +0,0 @@
|
|||||||
import { useEffect, useState } from 'react';
|
|
||||||
import StorageUtils from '../utils/storage';
|
|
||||||
import { useAppContext } from '../utils/app.context';
|
|
||||||
import { classNames } from '../utils/misc';
|
|
||||||
import daisyuiThemes from 'daisyui/theme/object';
|
|
||||||
import { THEMES } from '../Config';
|
|
||||||
import { useNavigate } from 'react-router';
|
|
||||||
|
|
||||||
export default function Header() {
|
|
||||||
const navigate = useNavigate();
|
|
||||||
const [selectedTheme, setSelectedTheme] = useState(StorageUtils.getTheme());
|
|
||||||
const { setShowSettings } = useAppContext();
|
|
||||||
|
|
||||||
const setTheme = (theme: string) => {
|
|
||||||
StorageUtils.setTheme(theme);
|
|
||||||
setSelectedTheme(theme);
|
|
||||||
};
|
|
||||||
|
|
||||||
useEffect(() => {
|
|
||||||
document.body.setAttribute('data-theme', selectedTheme);
|
|
||||||
document.body.setAttribute(
|
|
||||||
'data-color-scheme',
|
|
||||||
daisyuiThemes[selectedTheme]?.['color-scheme'] ?? 'auto'
|
|
||||||
);
|
|
||||||
}, [selectedTheme]);
|
|
||||||
|
|
||||||
const { isGenerating, viewingChat } = useAppContext();
|
|
||||||
const isCurrConvGenerating = isGenerating(viewingChat?.conv.id ?? '');
|
|
||||||
|
|
||||||
const removeConversation = () => {
|
|
||||||
if (isCurrConvGenerating || !viewingChat) return;
|
|
||||||
const convId = viewingChat?.conv.id;
|
|
||||||
if (window.confirm('Are you sure to delete this conversation?')) {
|
|
||||||
StorageUtils.remove(convId);
|
|
||||||
navigate('/');
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
const downloadConversation = () => {
|
|
||||||
if (isCurrConvGenerating || !viewingChat) return;
|
|
||||||
const convId = viewingChat?.conv.id;
|
|
||||||
const conversationJson = JSON.stringify(viewingChat, null, 2);
|
|
||||||
const blob = new Blob([conversationJson], { type: 'application/json' });
|
|
||||||
const url = URL.createObjectURL(blob);
|
|
||||||
const a = document.createElement('a');
|
|
||||||
a.href = url;
|
|
||||||
a.download = `conversation_${convId}.json`;
|
|
||||||
document.body.appendChild(a);
|
|
||||||
a.click();
|
|
||||||
document.body.removeChild(a);
|
|
||||||
URL.revokeObjectURL(url);
|
|
||||||
};
|
|
||||||
|
|
||||||
return (
|
|
||||||
<div className="flex flex-row items-center pt-6 pb-6 sticky top-0 z-10 bg-base-100">
|
|
||||||
{/* open sidebar button */}
|
|
||||||
<label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
|
|
||||||
<svg
|
|
||||||
xmlns="http://www.w3.org/2000/svg"
|
|
||||||
width="16"
|
|
||||||
height="16"
|
|
||||||
fill="currentColor"
|
|
||||||
className="bi bi-list"
|
|
||||||
viewBox="0 0 16 16"
|
|
||||||
>
|
|
||||||
<path
|
|
||||||
fillRule="evenodd"
|
|
||||||
d="M2.5 12a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5"
|
|
||||||
/>
|
|
||||||
</svg>
|
|
||||||
</label>
|
|
||||||
|
|
||||||
<div className="grow text-2xl font-bold ml-2">llama.cpp</div>
|
|
||||||
|
|
||||||
{/* action buttons (top right) */}
|
|
||||||
<div className="flex items-center">
|
|
||||||
{viewingChat && (
|
|
||||||
<div className="dropdown dropdown-end">
|
|
||||||
{/* "..." button */}
|
|
||||||
<button
|
|
||||||
tabIndex={0}
|
|
||||||
role="button"
|
|
||||||
className="btn m-1"
|
|
||||||
disabled={isCurrConvGenerating}
|
|
||||||
>
|
|
||||||
<svg
|
|
||||||
xmlns="http://www.w3.org/2000/svg"
|
|
||||||
width="16"
|
|
||||||
height="16"
|
|
||||||
fill="currentColor"
|
|
||||||
className="bi bi-three-dots-vertical"
|
|
||||||
viewBox="0 0 16 16"
|
|
||||||
>
|
|
||||||
<path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0" />
|
|
||||||
</svg>
|
|
||||||
</button>
|
|
||||||
{/* dropdown menu */}
|
|
||||||
<ul
|
|
||||||
tabIndex={0}
|
|
||||||
className="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow"
|
|
||||||
>
|
|
||||||
<li onClick={downloadConversation}>
|
|
||||||
<a>Download</a>
|
|
||||||
</li>
|
|
||||||
<li className="text-error" onClick={removeConversation}>
|
|
||||||
<a>Delete</a>
|
|
||||||
</li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
)}
|
|
||||||
|
|
||||||
<div className="tooltip tooltip-bottom" data-tip="Settings">
|
|
||||||
<button className="btn" onClick={() => setShowSettings(true)}>
|
|
||||||
{/* settings button */}
|
|
||||||
<svg
|
|
||||||
xmlns="http://www.w3.org/2000/svg"
|
|
||||||
width="16"
|
|
||||||
height="16"
|
|
||||||
fill="currentColor"
|
|
||||||
className="bi bi-gear"
|
|
||||||
viewBox="0 0 16 16"
|
|
||||||
>
|
|
||||||
<path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0" />
|
|
||||||
<path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z" />
|
|
||||||
</svg>
|
|
||||||
</button>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
{/* theme controller is copied from https://daisyui.com/components/theme-controller/ */}
|
|
||||||
<div className="tooltip tooltip-bottom" data-tip="Themes">
|
|
||||||
<div className="dropdown dropdown-end dropdown-bottom">
|
|
||||||
<div tabIndex={0} role="button" className="btn m-1">
|
|
||||||
<svg
|
|
||||||
xmlns="http://www.w3.org/2000/svg"
|
|
||||||
width="16"
|
|
||||||
height="16"
|
|
||||||
fill="currentColor"
|
|
||||||
className="bi bi-palette2"
|
|
||||||
viewBox="0 0 16 16"
|
|
||||||
>
|
|
||||||
<path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z" />
|
|
||||||
</svg>
|
|
||||||
</div>
|
|
||||||
<ul
|
|
||||||
tabIndex={0}
|
|
||||||
className="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto"
|
|
||||||
>
|
|
||||||
<li>
|
|
||||||
<button
|
|
||||||
className={classNames({
|
|
||||||
'btn btn-sm btn-block btn-ghost justify-start': true,
|
|
||||||
'btn-active': selectedTheme === 'auto',
|
|
||||||
})}
|
|
||||||
onClick={() => setTheme('auto')}
|
|
||||||
>
|
|
||||||
auto
|
|
||||||
</button>
|
|
||||||
</li>
|
|
||||||
{THEMES.map((theme) => (
|
|
||||||
<li key={theme}>
|
|
||||||
<input
|
|
||||||
type="radio"
|
|
||||||
name="theme-dropdown"
|
|
||||||
className="theme-controller btn btn-sm btn-block btn-ghost justify-start"
|
|
||||||
aria-label={theme}
|
|
||||||
value={theme}
|
|
||||||
checked={selectedTheme === theme}
|
|
||||||
onChange={(e) => e.target.checked && setTheme(theme)}
|
|
||||||
/>
|
|
||||||
</li>
|
|
||||||
))}
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
);
|
|
||||||
}
|
|
@ -1,96 +0,0 @@
|
|||||||
import { useEffect, useState } from 'react';
|
|
||||||
import { classNames } from '../utils/misc';
|
|
||||||
import { Conversation } from '../utils/types';
|
|
||||||
import StorageUtils from '../utils/storage';
|
|
||||||
import { useNavigate, useParams } from 'react-router';
|
|
||||||
|
|
||||||
export default function Sidebar() {
|
|
||||||
const params = useParams();
|
|
||||||
const navigate = useNavigate();
|
|
||||||
|
|
||||||
const [conversations, setConversations] = useState<Conversation[]>([]);
|
|
||||||
const [currConv, setCurrConv] = useState<Conversation | null>(null);
|
|
||||||
|
|
||||||
useEffect(() => {
|
|
||||||
StorageUtils.getOneConversation(params.convId ?? '').then(setCurrConv);
|
|
||||||
}, [params.convId]);
|
|
||||||
|
|
||||||
useEffect(() => {
|
|
||||||
const handleConversationChange = async () => {
|
|
||||||
setConversations(await StorageUtils.getAllConversations());
|
|
||||||
};
|
|
||||||
StorageUtils.onConversationChanged(handleConversationChange);
|
|
||||||
handleConversationChange();
|
|
||||||
return () => {
|
|
||||||
StorageUtils.offConversationChanged(handleConversationChange);
|
|
||||||
};
|
|
||||||
}, []);
|
|
||||||
|
|
||||||
return (
|
|
||||||
<>
|
|
||||||
<input
|
|
||||||
id="toggle-drawer"
|
|
||||||
type="checkbox"
|
|
||||||
className="drawer-toggle"
|
|
||||||
defaultChecked
|
|
||||||
/>
|
|
||||||
|
|
||||||
<div className="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
|
|
||||||
<label
|
|
||||||
htmlFor="toggle-drawer"
|
|
||||||
aria-label="close sidebar"
|
|
||||||
className="drawer-overlay"
|
|
||||||
></label>
|
|
||||||
<div className="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
|
|
||||||
<div className="flex flex-row items-center justify-between mb-4 mt-4">
|
|
||||||
<h2 className="font-bold ml-4">Conversations</h2>
|
|
||||||
|
|
||||||
{/* close sidebar button */}
|
|
||||||
<label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
|
|
||||||
<svg
|
|
||||||
xmlns="http://www.w3.org/2000/svg"
|
|
||||||
width="16"
|
|
||||||
height="16"
|
|
||||||
fill="currentColor"
|
|
||||||
className="bi bi-arrow-bar-left"
|
|
||||||
viewBox="0 0 16 16"
|
|
||||||
>
|
|
||||||
<path
|
|
||||||
fillRule="evenodd"
|
|
||||||
d="M12.5 15a.5.5 0 0 1-.5-.5v-13a.5.5 0 0 1 1 0v13a.5.5 0 0 1-.5.5M10 8a.5.5 0 0 1-.5.5H3.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L3.707 7.5H9.5a.5.5 0 0 1 .5.5"
|
|
||||||
/>
|
|
||||||
</svg>
|
|
||||||
</label>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
{/* list of conversations */}
|
|
||||||
<div
|
|
||||||
className={classNames({
|
|
||||||
'btn btn-ghost justify-start': true,
|
|
||||||
'btn-active': !currConv,
|
|
||||||
})}
|
|
||||||
onClick={() => navigate('/')}
|
|
||||||
>
|
|
||||||
+ New conversation
|
|
||||||
</div>
|
|
||||||
{conversations.map((conv) => (
|
|
||||||
<div
|
|
||||||
key={conv.id}
|
|
||||||
className={classNames({
|
|
||||||
'btn btn-ghost justify-start font-normal': true,
|
|
||||||
'btn-active': conv.id === currConv?.id,
|
|
||||||
})}
|
|
||||||
onClick={() => navigate(`/chat/${conv.id}`)}
|
|
||||||
dir="auto"
|
|
||||||
>
|
|
||||||
<span className="truncate">{conv.name}</span>
|
|
||||||
</div>
|
|
||||||
))}
|
|
||||||
<div className="text-center text-xs opacity-40 mt-auto mx-4">
|
|
||||||
Conversations are saved to browser's IndexedDB
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</>
|
|
||||||
);
|
|
||||||
}
|
|
@ -1,38 +0,0 @@
|
|||||||
export const XCloseButton: React.ElementType<
|
|
||||||
React.ClassAttributes<HTMLButtonElement> &
|
|
||||||
React.HTMLAttributes<HTMLButtonElement>
|
|
||||||
> = ({ className, ...props }) => (
|
|
||||||
<button className={`btn btn-square btn-sm ${className ?? ''}`} {...props}>
|
|
||||||
<svg
|
|
||||||
xmlns="http://www.w3.org/2000/svg"
|
|
||||||
className="h-6 w-6"
|
|
||||||
fill="none"
|
|
||||||
viewBox="0 0 24 24"
|
|
||||||
stroke="currentColor"
|
|
||||||
>
|
|
||||||
<path
|
|
||||||
strokeLinecap="round"
|
|
||||||
strokeLinejoin="round"
|
|
||||||
strokeWidth="2"
|
|
||||||
d="M6 18L18 6M6 6l12 12"
|
|
||||||
/>
|
|
||||||
</svg>
|
|
||||||
</button>
|
|
||||||
);
|
|
||||||
|
|
||||||
export const OpenInNewTab = ({
|
|
||||||
href,
|
|
||||||
children,
|
|
||||||
}: {
|
|
||||||
href: string;
|
|
||||||
children: string;
|
|
||||||
}) => (
|
|
||||||
<a
|
|
||||||
className="underline"
|
|
||||||
href={href}
|
|
||||||
target="_blank"
|
|
||||||
rel="noopener noreferrer"
|
|
||||||
>
|
|
||||||
{children}
|
|
||||||
</a>
|
|
||||||
);
|
|
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
     auto generate = [&](const std::string & prompt) {
         std::string response;
 
-        const bool is_first = llama_kv_self_used_cells(ctx) == 0;
+        const bool is_first = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) == 0;
 
         // tokenize the prompt
         const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
         while (true) {
             // check if we have enough space in the context to evaluate this batch
             int n_ctx = llama_n_ctx(ctx);
-            int n_ctx_used = llama_kv_self_used_cells(ctx);
+            int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0);
             if (n_ctx_used + batch.n_tokens > n_ctx) {
                 printf("\033[0m\n");
                 fprintf(stderr, "context size exceeded\n");
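With `llama_kv_self_used_cells` gone, the chat example derives context usage from the largest position in sequence 0. A hedged sketch of the same check as a standalone helper; the `+ 1` assumes `llama_memory_seq_pos_max` returns -1 for an empty sequence, which is not stated in this diff:

```cpp
#include "llama.h"

// Sketch: does a batch of n_new_tokens still fit in the context window?
// Assumes llama_memory_seq_pos_max returns -1 when sequence 0 is empty.
static bool fits_in_context(llama_context * ctx, int32_t n_new_tokens) {
    const int32_t n_ctx      = llama_n_ctx(ctx);
    const int32_t n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) + 1;

    return n_ctx_used + n_new_tokens <= n_ctx;
}
```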
@@ -84,13 +84,13 @@ int main(int argc, char ** argv) {
     model_params.n_gpu_layers = ngl;
 
     llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
     // tokenize the prompt
 
     // find the number of tokens in the prompt
@@ -217,7 +217,7 @@ int main(int argc, char ** argv) {
     {
         LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
 
-        llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1);
+        llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, n_past, -1);
     }
 
     if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
@@ -142,6 +142,8 @@ int main(int argc, char ** argv) {
         }
     }
 
+    auto * mem_tgt = llama_get_memory(ctx_tgt);
+    auto * mem_dft = llama_get_memory(ctx_dft);
+
     // Tokenize the prompt
     std::vector<llama_token> inp;
|
|||||||
{
|
{
|
||||||
LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
|
LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
|
||||||
|
|
||||||
llama_kv_self_seq_keep(ctx_dft, s_keep);
|
llama_memory_seq_keep(mem_dft, s_keep);
|
||||||
llama_kv_self_seq_cp (ctx_dft, s_keep, 0, -1, -1);
|
llama_memory_seq_cp (mem_dft, s_keep, 0, -1, -1);
|
||||||
llama_kv_self_seq_keep(ctx_dft, 0);
|
llama_memory_seq_keep(mem_dft, 0);
|
||||||
|
|
||||||
llama_kv_self_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
|
llama_memory_seq_rm (mem_tgt, s_keep, n_past_tgt, -1);
|
||||||
llama_kv_self_seq_keep(ctx_tgt, s_keep);
|
llama_memory_seq_keep(mem_tgt, s_keep);
|
||||||
llama_kv_self_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
|
llama_memory_seq_cp (mem_tgt, s_keep, 0, -1, -1);
|
||||||
llama_kv_self_seq_keep(ctx_tgt, 0);
|
llama_memory_seq_keep(mem_tgt, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int s = 0; s < n_seq_dft; ++s) {
|
for (int s = 0; s < n_seq_dft; ++s) {
|
||||||
@@ -444,7 +446,7 @@ int main(int argc, char ** argv) {
         common_batch_clear(batch_dft);
         common_batch_add  (batch_dft, token_id, n_past_dft, { 0 }, true);
 
-        llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1);
+        llama_memory_seq_rm(mem_dft, 0, n_past_dft, -1);
         // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
         llama_decode(ctx_dft, batch_dft);
 
@@ -503,8 +505,8 @@ int main(int argc, char ** argv) {
                 if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
                     LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
 
-                    llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1);
-                    llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
+                    llama_memory_seq_rm(mem_dft, n_seq_cur, -1, -1);
+                    llama_memory_seq_cp(mem_dft, s, n_seq_cur, -1, -1);
 
                     // all previous tokens from this branch are now also part of the new branch
                     for (int t = 0; t < batch_tgt.n_tokens; ++t) {
@@ -585,9 +587,9 @@ int main(int argc, char ** argv) {
 
         // evaluate the target model on the drafted tokens
         {
-            llama_kv_self_seq_keep(ctx_tgt, 0);
+            llama_memory_seq_keep(mem_tgt, 0);
             for (int s = 1; s < n_seq_dft; ++s) {
-                llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1);
+                llama_memory_seq_cp(mem_tgt, 0, s, -1, -1);
             }
 
             // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
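The speculative example applies the same sequence bookkeeping to both the draft and the target memory to collapse the accepted branch onto sequence 0. Mirroring the target-side calls above as one helper (the function name is illustrative, the calls are the ones used in this diff):

```cpp
#include "llama.h"

// Keep only the accepted draft branch: trim its rejected tail, drop every
// other sequence, and duplicate the survivor into sequence 0.
static void keep_branch(llama_context * ctx, llama_seq_id s_keep, llama_pos n_past) {
    auto * mem = llama_get_memory(ctx);

    llama_memory_seq_rm  (mem, s_keep, n_past, -1);
    llama_memory_seq_keep(mem, s_keep);
    llama_memory_seq_cp  (mem, s_keep, 0, -1, -1);
    llama_memory_seq_keep(mem, 0);
}
```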
Some files were not shown because too many files have changed in this diff.